In [105]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [106]:
# Load the labelled training reviews from the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [107]:
# Preview the first five training rows (head() defaults to n=5).
train.head()

Unnamed: 0,label,text
0,1,"Henry Thomas showed a restraint, even when the..."
1,1,"This movie starts out brisk, has some slow mom..."
2,1,Castle of Blood is a good example of the quali...
3,1,I viewed the movie together with a homophobic ...
4,1,"The ""Men in White"" movie is definitely one of ..."


In [108]:
# Load the test reviews from the working directory.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [109]:
# Preview the first five test rows (head() defaults to n=5).
test.head()

Unnamed: 0,Id,text
0,0,I cannot believe I actually sat through the wh...
1,1,I saw this one remastered on DVD. It had a big...
2,2,"Irrespective of the accuracy of facts, Bandit ..."
3,3,"Significant Spoilers! This is a sick, disturbi..."
4,4,If there are people that don't like this movie...


There are several ways to get started. First, think of a way to transform the text of a review into a feature vector, such that each dimension represents a word and the value represents the weight of that word in the review. You can also try different TF-IDF tricks to adjust the weightings, and you may consider adding bi-gram features as well. The `sklearn` package offers several ways to extract features from text, so let's play with one of them.

In [110]:
# Word-count vectorizer built with no arguments, so scikit-learn's defaults apply.
cont_vect = CountVectorizer()

In [111]:
# Learn the vocabulary from the training reviews and encode them in one step.
X_train = cont_vect.fit_transform(train["text"])

In [112]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
X_train

<10000x51702 sparse matrix of type '<class 'numpy.int64'>'
	with 1381868 stored elements in Compressed Sparse Row format>

As the name suggests, we are transforming the reviews in the training set into a 10000 x 51702 matrix. The number 51702 indicates that there are 51702 unique words in the training reviews. We can also limit the number of features in the matrix by setting `max_features` when instantiating the CountVectorizer.

In [113]:
# Encode the test reviews with the vocabulary already learned from the
# training set (transform only, no re-fitting).
X_test = cont_vect.transform(test["text"])

In [114]:
# Target vector: the `label` column of the training frame.
y_train = train["label"]

You may notice that we are no longer constructing X_train, X_test, y_train, y_test using train_test_split. That is because the train-test split is now provided by a third party, and y_test is hidden from you.

## Tuning a Logistic Regression Classifier

In [115]:
# Candidate values of C (inverse regularisation strength) to search over.
param_grid = {
    'C': [.001, .01, .1, 1, 10],
}

# 5-fold cross-validated grid search over C for logistic regression;
# max_iter is raised to 1000 for the solver.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
)
grid.fit(X_train, y_train)

# Report the winning configuration.
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)

Best cross-validation score: 0.87
Best parameters:  {'C': 0.1}
Best estimator:  LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [116]:
# Reuse the model selected by the grid search instead of pasting its repr
# back in as a constructor call. GridSearchCV refits the best configuration
# on the full training set by default (refit=True), so `best_estimator_` is
# already the fitted model we want. The pasted argument list hard-coded C
# (which silently goes stale if the grid changes) and passed `multi_class`,
# which is deprecated in scikit-learn >= 1.5 and triggers a FutureWarning.
lr = grid.best_estimator_

# Predicted labels for the test reviews.
y_pred = lr.predict(X_test)

# Accuracy on the same data the model was fit on. This is an optimistic
# figure; the cross-validated score reported by the grid search is the more
# realistic estimate of generalisation performance.
lr.score(X_train, y_train)

0.9868

In [118]:
# Predicted labels for the test reviews; these are what get written to the
# submission file.
lr_prediction = lr.predict(X_test)

In [119]:
# Assemble the submission table (one row per test Id) and write it to disk
# without the DataFrame index.
submission = pd.DataFrame({"Id": test["Id"], "Category": lr_prediction})
submission.to_csv("LR_prediction.csv", index=False)