In [37]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

## 1: Face Recognition, but not evil this time

Using the faces dataset in:

```
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
```

If you use the `faces.target` and `faces.target_names` attributes, you can build a facial recognition algorithm.

Use sklearn **gridsearch** (or an equivalent, like random search) to optimize the model for accuracy. Try both a SVM-based classifier and a logistic regression based classifier (with a feature pipeline of your choice) to get the best model. You should have at least 80% accuracy.

In [None]:
# Recieved help from Javad

In [None]:
faces = fetch_lfw_people(min_faces_per_person=60)

In [None]:
pca = PCA(n_components = 200, random_state=2, whiten=True)

In [None]:
svc = SVC(kernel='rbf', class_weight="balanced")

In [None]:
svc_pipeline = make_pipeline(pca, svc)

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target, random_state=42)

In [None]:
svc_param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}

In [None]:
svc_grid = GridSearchCV(svc_pipeline, svc_param_grid)

In [None]:
svc_grid.fit(Xtrain, ytrain)
svc_grid.best_params_

In [None]:
svc_ypred = svc_grid.predict(Xtest)

In [None]:
accuracy_score(ytest, svc_ypred)

In [None]:
lr = LogisticRegression()
lr_pipeline = make_pipeline(pca, lr)

In [None]:
lr_param_grid = {"logisticregression__C": [0.25, 0.5, 1, 5, 10],
                "logisticregression__penalty": ["l2","none"]}

In [None]:
lr_grid = GridSearchCV(lr_pipeline, lr_param_grid)

In [None]:
lr_grid.fit(Xtrain, ytrain)
lr_grid.best_params_

In [None]:
lr_ypred = lr_grid.predict(Xtest)

In [None]:
accuracy_score(ytest, lr_ypred)

# 2: Bag of Words, Bag of Popcorn

By this point, you are ready for the [Bag of Words, Bag of Popcorn](https://www.kaggle.com/c/word2vec-nlp-tutorial/data) competition. 

Use NLP feature pre-processing (using, SKLearn, Gensim, Spacy or Hugginface) to build the best classifier you can. Use a  feature pipeline, and gridsearch for your final model.

A succesful project should get 90% or more on a **holdout** dataset you kept for yourself.

In [None]:
# Kaggle tutorial start

In [None]:
# part 1 start

In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [3]:
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, and return the result.
    return( " ".join( meaningful_words )) 

In [4]:
# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length of the movie review list 
for i in range(num_reviews):
    
    clean_train_reviews.append(review_to_words(train["review"][i]))

In [51]:
# countvectorizer = sklearn bag of words tool
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)

# max_features is set to 5k according to kaggle tutorial to select the top 5k most popular words
# countvectorizer also has stop_words but kaggle tutorial did it step by step to explain it properly (it worked)

In [52]:
train_data_features = vectorizer.fit_transform(clean_train_reviews)

In [53]:
train_data_features = train_data_features.toarray()

In [69]:
# all 25k reviews but now with 5k features(one for each word)
train_data_features.shape

(25000, 5000)

In [None]:
# forest = RandomForestClassifier(n_estimators = 100)

# forest = forest.fit(X_train, y_train)

# part 1 end
# Kaggle tutorial end

In [87]:
# Received help from Jasleen beyond this point

In [83]:
X_train, X_test, y_train, y_test = train_test_split(train_data_features, train["sentiment"], test_size=0.2, random_state=0)

In [84]:
# random forest time?
forest = RandomForestClassifier(n_estimators = 100)

forest = forest.fit(X_train, y_train)

In [85]:
y_pred = forest.predict(X_test)

In [86]:
accuracy_score(y_test, y_pred)

0.844

## Resubmission

In [14]:
df = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [None]:
# cleanup start

In [6]:
def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review).get_text() 
   
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 

    words = letters_only.lower().split()                             
    
    stops = set(stopwords.words("english"))                  
    
    meaningful_words = [w for w in words if not w in stops]   
    
    return( " ".join( meaningful_words )) 

In [7]:
num_reviews = train["review"].size

clean_train_reviews = []

for i in range(num_reviews):
    
    clean_train_reviews.append(review_to_words(train["review"][i]))

In [None]:
# cleanup end

In [11]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)
forest = RandomForestClassifier()

In [22]:
pipe = Pipeline([("vec", vectorizer), ("forest", forest)])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(clean_train_reviews, df["sentiment"], test_size=0.2, random_state=42)

In [24]:
grid_params = {"forest__n_estimators": [10, 50, 100, 150, 200]}

In [25]:
grid = GridSearchCV(pipe, grid_params, cv=5)

In [26]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vec',
                                        CountVectorizer(max_features=5000)),
                                       ('forest', RandomForestClassifier())]),
             param_grid={'forest__n_estimators': [10, 50, 100, 150, 200]})

In [27]:
grid.best_params_

{'forest__n_estimators': 150}

In [31]:
y_pred = grid.predict(X_test)

In [38]:
accuracy_score(y_test, y_pred)

0.8492