### Loading  Training and Testing Dataset

In [2]:
import pandas as pd
import numpy as np

Training:
- polarity : Target 2:positive, 1:neutral, 0:negative
- words : preprocessed sentences
- type : the tags of the words from lemmatizing 

In [3]:
training = pd.read_csv('./clean_data/training_data.csv', encoding='utf8')

In [4]:
training = training.dropna()
training.head()

Unnamed: 0,words,type,polarity
0,judge previous post used be good place not longer,VB JJ NN VB VB JJ NN RB JJ,0.0
1,be arrive noon place be empty staff act be imp...,VB VB NN NN VB JJ NN VB VB VB VB RB JJ,0.0
2,never bring complimentary noodle ignore repeat...,RB VB JJ NN VB JJ NN NN VB NN NN,0.0
3,food be lousy too sweet too salty portion tiny,NN VB JJ RB JJ RB JJ NN JJ,0.0
4,food be lousy too sweet too salty portion tiny,NN VB JJ RB JJ RB JJ NN JJ,0.0


### Train Test Split

In [27]:
from sklearn.model_selection import train_test_split

X = training.words
y = training.polarity
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

### Vectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer(max_features=900, norm=None)
tvec.fit(X_train)

#transformed matrix of words
X_train_tvec = tvec.transform(X_train)
X_test_tvec = tvec.transform(X_test)

### Finding an Estimator
- Choosing an estimator which will perform at its optimum so can be tuned further.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [29]:
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          KNeighborsClassifier(), 
          MultinomialNB(), 
          SGDClassifier()]

for model in models:
    print model
    
    model.fit(X_train_tvec, y_train)
    y_pred = model.predict(X_test_tvec)
    score = accuracy_score(y_test, y_pred)
    print score
    print '_'*70

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.7903225806451613
______________________________________________________________________
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.7641129032258065
______________________________________________________________________
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_

With limited time I have chosen a localised method to choose my classifier

# GridSearch using Tfidf optimal parameters 
- Logistic Regression
- Random Forest

Grid Search CV Logistic Regression
Logistic regression is a very fast classifier.
I am using grid search to find best hyperparameters.
random_state=1 lr param fixed random... for l1 l2 ???????

For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used??? arguement?


In [30]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict

### Logistic Regression

In [32]:
lr = LogisticRegression(random_state=1)
lr_params = {'penalty': ['l1','l2'],
          'solver':['liblinear'],
          'C': np.logspace(-10,10,21)}

lr_grid = GridSearchCV(lr, param_grid=lr_params, cv=5, n_jobs=-1, verbose=2)

# fit with the tranformed tfidf matrix as X
lr_grid.fit(X_train_tvec, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   17.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03,
       1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]), 'solver': ['liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [45]:
# best score on the training data:
print lr_grid.best_score_

# assign the best estimator to a variable:
best_lr = lr_grid.best_estimator_

# best parameters on the training data:
lr_grid.best_params_

0.8107173725151253


{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}

In [None]:
l2 is ridgeeeeeeee
# Lasso was chosen: this indicates that maybe unimportant (noise) variables
# is more of an issue in our data than multicollinearity.

In [49]:
# Score it on the testing data:
best_lr.score(X_test_tvec, y_test)

0.7883064516129032

# Evaluation 
X_train_tvec, y_train

lr_y_pred

In [77]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [64]:
print "Number of classification errors:", np.abs(y_pred_lr - y_test).sum()

Number of classification errors: 193.0


In [93]:
y_pred_lr = best_lr.predict(X_test_tvec)
y_pp_lr = best_lr.predict_proba(X_test_tvec)


# Get the predicted probability vector and explicitly name the columns:
Y_pp = pd.DataFrame(knn.predict_proba(X_test_tvec), columns=['class_0_pp','class_1_pp'])
Y_pp.head(10)

In [69]:
confusion_matrix(y_test,y_pred_lr)

array([[ 60,   2,  59],
       [  4,   2,  10],
       [ 29,   1, 329]])

In [81]:
print accuracy_score(y_test,y_pred_lr) # same as above accuracy
# print precision_score(y_test,y_pred_lr)
# print recall_score(y_test,y_pred_lr)
print f1_score(y_test==1, y_pred_lr==1)


0.7883064516129032
0.19047619047619047


In [83]:
print classification_report(y_test,y_pred_lr)

             precision    recall  f1-score   support

        0.0       0.65      0.50      0.56       121
        1.0       0.40      0.12      0.19        16
        2.0       0.83      0.92      0.87       359

avg / total       0.77      0.79      0.77       496



# Let's analyse the features importances

In [92]:
coef_df = pd.DataFrame({
        'coef':best_lr.coef_[0]})
coef_df['abs_coef'] = np.abs(coef_df.coef)
# sort by absolute value of coefficient (magnitude)
coef_df.sort_values('abs_coef', ascending=False, inplace=True)

coef_df.head()

Unnamed: 0,coef,abs_coef
471,0.556775,0.556775
343,0.47466,0.47466
322,-0.468406,0.468406
609,-0.456145,0.456145
210,-0.451457,0.451457


In [91]:
# Show non-zero coefs and predictors
# coef_df[coef_df.coef != 0]
len(coef_df[coef_df.coef != 0])

900

## Testing data
    - polarity : Target 2:positive, 1:neutral, 0:negative
    - words : preprocessed sentences
    - type : the tags of the words from lemmatizing 


TFDIF was fitted on the training data which will be used to transfomation the words in testing into a sparse matrix
Logistic Regressions best parameters which were fit for the training data will then predict sentiment (y_hat) for the transformed testing data.



---- logistic regression gave a good score in classifying the predictors but will try different classifiers to determine best accuracy score 


In [43]:
testing = pd.read_csv('./clean_data/testing_data.csv', encoding='utf8')
testing = testing.dropna()
X = testing.words

In [47]:
# transform the testing data
X_mat = tvec.transform(X)

# predictions and predictive probabilities
y_hat = best_lr.predict(X_mat)
y_hat_pp = best_lr.predict_proba(X_mat)
y_hat_pp

# probability in class 1 class 2 class 3 

array([[0.00474512, 0.00256113, 0.99269375],
       [0.48790896, 0.09569509, 0.41639595],
       [0.78429316, 0.03728551, 0.17842134],
       ...,
       [0.28897808, 0.0186247 , 0.69239723],
       [0.28897808, 0.0186247 , 0.69239723],
       [0.28897808, 0.0186247 , 0.69239723]])

# NOTES

https://stackoverflow.com/questions/40679883/scikit-learn-how-to-include-others-features-after-performed-fit-and-transform-o

Logistic Regression performs best out of the two classifiers. 

In [None]:
- boostrap
- switch vectoriser and train test split 
- chek models again!!




later, additonal work, 
- clustering
whats similar to logistic regression
-neural networks
from sklearn.neural_network import MLPClassifier 