# Intro to NLP Lab

In this lab, you'll be classifying randomly selected tweets from political officials into whether or not they are partisan tweets or neutral. In the following import statement, we're selecting only the columns that are important, but there may be more useful features in that set. Feel free to explore. 

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, \
HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_union, make_pipeline, Pipeline

In [38]:
import pandas as pd

df = pd.read_csv('datasets/political_media.csv',
                usecols=[7, 20])
df.head()

Unnamed: 0,bias,text
0,partisan,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...
1,partisan,VIDEO - #Obamacare: Full of Higher Costs and ...
2,neutral,Please join me today in remembering our fallen...
3,neutral,RT @SenatorLeahy: 1st step toward Senate debat...
4,partisan,.@amazon delivery #drones show need to update ...


## Set up

Please split the dataset into a training and test set and convert the `bias` feature into 0s and 1s.

In [39]:
df['bias'].unique()

X = df['text'].values
y = df['bias'].map(lambda x: 1 if x == 'partisan' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.33)

## Modeling

Please try the following techniques to transform the data. For each technique, do the following:

1. Transform the training data
2. Fit a `RandomForestClassifier` to the transformed training data
3. Transform the test data
4. Discuss the goodness of fit of your model using the test data and a classification report and confusion matrix

### 1. `CountVectorizer()`

In [40]:
cv = CountVectorizer()

In [41]:
Xtrain_cv = cv.fit_transform(X_train)
Xtest_cv = cv.transform(X_test)

rfc = RandomForestClassifier()
rfc.fit(Xtrain_cv, y_train)
print(rfc.score(Xtrain_cv, y_train))
print(confusion_matrix(y_train, rfc.predict(Xtrain_cv)))
print(classification_report(y_train, rfc.predict(Xtrain_cv)))

0.97223880597
[[2474    1]
 [  92  783]]
             precision    recall  f1-score   support

          0       0.96      1.00      0.98      2475
          1       1.00      0.89      0.94       875

avg / total       0.97      0.97      0.97      3350



In [42]:
rfc.score(Xtest_cv, y_test)
print(confusion_matrix(y_test, rfc.predict(Xtest_cv)))
print(classification_report(y_test, rfc.predict(Xtest_cv)))

[[1182   32]
 [ 386   50]]
             precision    recall  f1-score   support

          0       0.75      0.97      0.85      1214
          1       0.61      0.11      0.19       436

avg / total       0.72      0.75      0.68      1650



### 2. `CountVectorizer()` with your choice of `min_df` and `max_df`

In [43]:
cv = CountVectorizer(min_df=0.01, max_df=0.80)
cv.fit(X_train)

X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

rfc = RandomForestClassifier()
rfc.fit(X_train_cv, y_train)
print(rfc.score(X_train_cv, y_train))
print(confusion_matrix(y_train, rfc.predict(X_train_cv)))

0.97671641791
[[2470    5]
 [  73  802]]


In [44]:
print(classification_report(y_train, rfc.predict(X_train_cv)))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      2475
          1       0.99      0.92      0.95       875

avg / total       0.98      0.98      0.98      3350



In [45]:
print(rfc.score(X_test_cv, y_test))
print(confusion_matrix(y_test, rfc.predict(X_test_cv)))
print(classification_report(y_test, rfc.predict(X_test_cv)))

0.760606060606
[[1157   57]
 [ 338   98]]
             precision    recall  f1-score   support

          0       0.77      0.95      0.85      1214
          1       0.63      0.22      0.33       436

avg / total       0.74      0.76      0.72      1650



### 3. `CountVectorizer()` with English stop words

In [46]:
cv = CountVectorizer(stop_words='english')
cv.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [47]:
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

rfc = RandomForestClassifier()
rfc.fit(X_train_cv, y_train)
print(rfc.score(X_train_cv, y_train))
print(confusion_matrix(y_train, rfc.predict(X_train_cv)))
print(classification_report(y_train, rfc.predict(X_train_cv)))
print('---------------------------------------------------')
print(rfc.score(X_test_cv, y_test))
print(confusion_matrix(y_test, rfc.predict(X_test_cv)))
print(classification_report(y_test, rfc.predict(X_test_cv)))

0.975820895522
[[2474    1]
 [  80  795]]
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      2475
          1       1.00      0.91      0.95       875

avg / total       0.98      0.98      0.98      3350

---------------------------------------------------
0.73696969697
[[1151   63]
 [ 371   65]]
             precision    recall  f1-score   support

          0       0.76      0.95      0.84      1214
          1       0.51      0.15      0.23       436

avg / total       0.69      0.74      0.68      1650



### 4. `TfidfVectorizer()` 

In [48]:
tv = TfidfVectorizer()
tv.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [49]:
Xtrain_tv = tv.transform(X_train)
Xtest_tv = tv.transform(X_test)

rfc = RandomForestClassifier()
rfc.fit(Xtrain_tv, y_train)
print(rfc.score(Xtest_cv, y_test))
print(confusion_matrix(y_test, rfc.predict(Xtest_cv)))
print(classification_report(y_test, rfc.predict(Xtest_cv)))

0.721818181818
[[1054  160]
 [ 299  137]]
             precision    recall  f1-score   support

          0       0.78      0.87      0.82      1214
          1       0.46      0.31      0.37       436

avg / total       0.70      0.72      0.70      1650



### 5. `TfidfVectorizer()` with English stop words

In [50]:
tv = TfidfVectorizer(stop_words='english', min_df=0.01)
tv.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [51]:
Xtrain_tv = tv.transform(X_train)
Xtest_tv = tv.transform(X_test)

rf = RandomForestClassifier()
rf.fit(Xtrain_tv, y_train)
print(rf.score(Xtest_tv, y_test))
print(confusion_matrix(y_test, rf.predict(Xtest_tv)))
print(classification_report(y_test, rf.predict(Xtest_tv)))

0.741818181818
[[1130   84]
 [ 342   94]]
             precision    recall  f1-score   support

          0       0.77      0.93      0.84      1214
          1       0.53      0.22      0.31       436

avg / total       0.70      0.74      0.70      1650



### Moving forward

With the remainder of your time, please try and find the best model and data transformation to predict partisan tweets. This is a challenging data set and can be approached from a number of ways.

Some techniques to try are:

1. Different types of data transformation 
2. Custom preprocessors for `CountVectorizer`
3. Custom stopword lists
4. Use of a dimensionality reduction technique (like `TruncatedSVD`)
5. Optimizing hyperparameters using `GridSearchCV`
6. Trying a different modeling technique such as `KNeighborsClassifier` or `LogisticRegression`

In [52]:
df['bias'].unique()

X = df['text'].values
y = df['bias'].map(lambda x: 1 if x == 'partisan' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.33)

In [53]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.decomposition import TruncatedSVD

In [54]:
cv = CountVectorizer(stop_words='english')
cv.fit(X_train)
Xtrain_cv = cv.transform(X_train)
tsvd = TruncatedSVD()

Xtrain_tsvd = tsvd.fit_transform(Xtrain_cv)
rfc = RandomForestClassifier()
rfc.fit(Xtrain_tsvd, y_train)
print(rfc.score(Xtrain_tsvd, y_train))
print(confusion_matrix(y_train, rfc.predict(Xtrain_tsvd)))
print(classification_report(y_train, rfc.predict(Xtrain_tsvd)))
print('\n')
print('_______________________________________________________')
print('Testing data')

Xtest_cv = cv.transform(X_test)
Xtest_tsvd = tsvd.transform(Xtest_cv)
print(rfc.score(Xtest_tsvd, y_test))
print(confusion_matrix(y_test, rfc.predict(Xtest_tsvd)))
print(classification_report(y_test, rfc.predict(Xtest_tsvd)))

0.962089552239
[[2442   14]
 [ 113  781]]
             precision    recall  f1-score   support

          0       0.96      0.99      0.97      2456
          1       0.98      0.87      0.92       894

avg / total       0.96      0.96      0.96      3350



_______________________________________________________
Testing data
0.700606060606
[[1084  149]
 [ 345   72]]
             precision    recall  f1-score   support

          0       0.76      0.88      0.81      1233
          1       0.33      0.17      0.23       417

avg / total       0.65      0.70      0.67      1650



In [55]:
# cleaner function
from nltk.stem import PorterStemmer
import string
from nltk.corpus import stopwords
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [56]:
cv = CountVectorizer(preprocessor=cleaner)
cv.fit(X_train)
Xtrain_cv = cv.transform(X_train)
tsvd = TruncatedSVD(n_components=500)

Xtrain_tsvd = tsvd.fit_transform(Xtrain_cv)
rfc = RandomForestClassifier()
rfc.fit(Xtrain_tsvd, y_train)
print(rfc.score(Xtrain_tsvd, y_train))
print(confusion_matrix(y_train, rfc.predict(Xtrain_tsvd)))
print(classification_report(y_train, rfc.predict(Xtrain_tsvd)))
print('\n')
print('_______________________________________________________')
print('Testing data')

Xtest_cv = cv.transform(X_test)
Xtest_tsvd = tsvd.transform(Xtest_cv)
print(rfc.score(Xtest_tsvd, y_test))
print(confusion_matrix(y_test, rfc.predict(Xtest_tsvd)))
print(classification_report(y_test, rfc.predict(Xtest_tsvd)))

0.977313432836
[[2454    2]
 [  74  820]]
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      2456
          1       1.00      0.92      0.96       894

avg / total       0.98      0.98      0.98      3350



_______________________________________________________
Testing data
0.733333333333
[[1175   58]
 [ 382   35]]
             precision    recall  f1-score   support

          0       0.75      0.95      0.84      1233
          1       0.38      0.08      0.14       417

avg / total       0.66      0.73      0.66      1650



In [58]:
X_train.shape

(3350,)

In [59]:
tf = TfidfVectorizer(preprocessor=cleaner)
tf.fit(X_train)
politics = pd.DataFrame(tf.transform(X_train).todense(), columns=tf.get_feature_names())
X_train.shape

(3350,)

In [60]:
politics.sum().sort_values(ascending=False).head(10)

today       58.773242
űş          46.562544
amp         41.926708
great       40.306478
hous        38.012409
thank       36.470600
job         35.953632
work        35.094587
american    34.742684
us          34.397409
dtype: float64

In [61]:
custom_words = ['today', 'amp', 'great', 'hous', 'thank', 'job', 'work', 'us', 'american']
stop = stopwords.words('english')
stop.extend(custom_words)

In [62]:
cv = CountVectorizer(preprocessor=cleaner, stop_words=stop)
cv.fit(X_train)
Xtrain_cv = cv.transform(X_train)
tsvd = TruncatedSVD(n_components=400)

Xtrain_tsvd = tsvd.fit_transform(Xtrain_cv)
rfc = RandomForestClassifier()
rfc.fit(Xtrain_tsvd, y_train)
print(rfc.score(Xtrain_tsvd, y_train))
print(confusion_matrix(y_train, rfc.predict(Xtrain_tsvd)))
print(classification_report(y_train, rfc.predict(Xtrain_tsvd)))
print('\n')
print('_______________________________________________________')
print('Testing data')

Xtest_cv = cv.transform(X_test)
Xtest_tsvd = tsvd.transform(Xtest_cv)
print(rfc.score(Xtest_tsvd, y_test))
print(confusion_matrix(y_test, rfc.predict(Xtest_tsvd)))
print(classification_report(y_test, rfc.predict(Xtest_tsvd)))

0.980298507463
[[2453    3]
 [  63  831]]
             precision    recall  f1-score   support

          0       0.97      1.00      0.99      2456
          1       1.00      0.93      0.96       894

avg / total       0.98      0.98      0.98      3350



_______________________________________________________
Testing data
0.739393939394
[[1164   69]
 [ 361   56]]
             precision    recall  f1-score   support

          0       0.76      0.94      0.84      1233
          1       0.45      0.13      0.21       417

avg / total       0.68      0.74      0.68      1650



# Trying now with Gridsearch and pipeline steps, with 3 classifier models

### the last hyperparameters didn't work as well and didn't provide good recall on the test data

In [63]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV

log_pipe = Pipeline([('tfvec', TfidfVectorizer(preprocessor=cleaner, stop_words=stop)),
                     ('tsvd', TruncatedSVD(n_components=400)),
                    ('lg', LogisticRegression(penalty='l1', C=0.5))])




In [64]:
log_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function cleane...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [65]:
log_pipe.score(X_train, y_train)

0.76507462686567163

In [66]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV

log_pipe = Pipeline([('tfvec', TfidfVectorizer(preprocessor=cleaner, stop_words=stop)),
                     ('tsvd', TruncatedSVD(n_components=400)),
                    ('lg', KNeighborsClassifier(n_neighbors=3))])

In [67]:
log_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfvec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function cleane...owski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])

In [68]:
log_pipe.score(X_train,y_train)

0.84268656716417911

In [69]:
log_pipe.score(X_test, y_test)

0.66000000000000003

In [70]:
print(confusion_matrix(y_test, log_pipe.predict(X_test)))

[[945 288]
 [273 144]]


In [71]:
print(classification_report(y_test, log_pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.78      0.77      0.77      1233
          1       0.33      0.35      0.34       417

avg / total       0.66      0.66      0.66      1650



In [72]:
from sklearn.tree import DecisionTreeClassifier
cv = CountVectorizer(preprocessor=cleaner, stop_words=stop)
Xtrain_cv = cv.fit_transform(X_train)
tsvd = TruncatedSVD(n_components=500)
Xtrain_tsvd = tsvd.fit_transform(Xtrain_cv)

In [74]:
grid = {'n_estimators':[5,6,7,8,9,10],
    'criterion': ['gini', 'entropy']}

rfc = RandomForestClassifier()
gs = GridSearchCV(rfc,param_grid=grid,n_jobs=-1, verbose=2)

In [75]:
gs.fit(Xtrain_tsvd, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    4.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [5, 6, 7, 8, 9, 10], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [76]:
gsb = gs.best_estimator_

In [77]:
gs.best_params_

{'criterion': 'entropy', 'n_estimators': 6}

In [79]:
Xtest_cv = cv.transform(X_test)
Xtest_tsvd = tsvd.transform(Xtest_cv)

In [80]:
gsb.score(Xtest_tsvd, y_test)

0.74242424242424243