In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

from nltk.corpus import stopwords

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
%matplotlib inline

### In this notebook I gridsearched tfidf and Logistic regression for the the data cleaned in 3 different ways. I found that the corpus that wasn't stripped of special characters gave the best score (df3)

In [9]:
df = pd.read_csv('data/ascii_clean.csv') 

In [10]:
X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2,
                                                    random_state=42)

In [11]:
english_stop = list(ENGLISH_STOP_WORDS)

In [12]:
pipe_tvec = Pipeline(
    [('tf', TfidfVectorizer()),
     ('lr', LogisticRegression())]
    ) 

pipe_params = {'tf__max_features': [None],
               'tf__strip_accents': [None, 'ascii'], 
               'tf__stop_words': [stopwords.words("english"), english_stop],
               'tf__ngram_range': [(1,1), (1,2)],
               'tf__min_df': [0.005, 1],
             
               'lr__solver': ['saga'],
               'lr__penalty': ['l2'],
               'lr__n_jobs': [-1]
               }

gs = GridSearchCV(pipe_tvec,
                  param_grid=pipe_params, 
                  cv=5,
                  verbose=1) 

In [14]:
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  8.9min finished


0.721375
{'lr__n_jobs': -1, 'lr__penalty': 'l2', 'lr__solver': 'saga', 'tf__max_features': None, 'tf__min_df': 1, 'tf__ngram_range': (1, 1), 'tf__stop_words': ['who', 'themselves', 'namely', 'only', 'nobody', 'least', 'in', 'thus', 'we', 'nor', 'thereupon', 'out', 'de', 'that', 'cannot', 'anywhere', 'their', 'always', 'ever', 'from', 'system', 'thick', 'wherever', 'besides', 'amongst', 'either', 'sometime', 'yourselves', 'whereas', 'forty', 'move', 'indeed', 'until', 'afterwards', 'a', 'cant', 'inc', 'others', 'latter', 'hundred', 'had', 'full', 'on', 'bottom', 'could', 'throughout', 'together', 'anything', 'is', 'anyhow', 'then', 'ltd', 'nowhere', 'do', 'why', 'neither', 'ten', 'would', 'myself', 'some', 'below', 'per', 'one', 'perhaps', 'be', 'made', 'without', 'they', 'upon', 'something', 'even', 'were', 'though', 'except', 'toward', 'somehow', 'see', 'sometimes', 'much', 'but', 'cry', 'above', 'with', 'someone', 'beforehand', 'whenever', 'when', 'whole', 'etc', 'them', 'fifteen', '

In [19]:
df2 = pd.read_csv('rg_clean.csv')

In [20]:
X = df2['text']
y = df2['target_col']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2,
                                                    random_state=42)

In [21]:
pipe_tvec = Pipeline(
    [('tf', TfidfVectorizer()),
     ('lr', LogisticRegression())]
    ) 

pipe_params = {'tf__max_features': [None],
               'tf__strip_accents': [None, 'ascii'], 
               'tf__stop_words': [stopwords.words("english"), english_stop],
               'tf__ngram_range': [(1,1), (1,2)],
               'tf__min_df': [0.005, 1],
             
               'lr__solver': ['saga'],
               'lr__penalty': ['l2'],
               'lr__n_jobs': [-1]
               }

gs = GridSearchCV(pipe_tvec,
                  param_grid=pipe_params, 
                  cv=5,
                  verbose=1) 

In [22]:
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  8.9min finished


0.723375
{'lr__n_jobs': -1, 'lr__penalty': 'l2', 'lr__solver': 'saga', 'tf__max_features': None, 'tf__min_df': 1, 'tf__ngram_range': (1, 1), 'tf__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 

In [24]:
df3 = pd.read_csv('data/clean_data.csv')

In [25]:
X = df3['text']
y = df3['target_col']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2,
                                                    random_state=42)

In [17]:
pipe_tvec = Pipeline(
    [('tf', TfidfVectorizer()),
     ('lr', LogisticRegression())]
    ) 

pipe_params = {'tf__max_features': [None],
               'tf__strip_accents': [None, 'ascii'], 
               'tf__stop_words': [stopwords.words("english"), english_stop],
               'tf__ngram_range': [(1,1), (1,2)],
               'tf__min_df': [0.005, 1],
             
               'lr__solver': ['saga'],
               'lr__penalty': ['l2'],
               'lr__n_jobs': [-1]
               }

gs = GridSearchCV(pipe_tvec,
                  param_grid=pipe_params, 
                  cv=5,
                  verbose=1) 

In [18]:
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  8.9min finished


0.7235
{'lr__n_jobs': -1, 'lr__penalty': 'l2', 'lr__solver': 'saga', 'tf__max_features': None, 'tf__min_df': 1, 'tf__ngram_range': (1, 1), 'tf__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'w

## d3

In [31]:
df3 = pd.read_csv('data/clean_data.csv')

X = df3['text']
y = df3['target_col']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2,
                                                    random_state=42)

In [32]:
tf = TfidfVectorizer(stop_words=stopwords.words("english"),
                     max_features=None,
                     ngram_range=(1,1),
                     strip_accents='ascii' 
                    )

tf.fit(X_train)
X_train_tf = tf.transform(X_train).todense()
X_test_tf = tf.transform(X_test).todense()

X_train_tf = tf.transform(X_train) #.todense()
X_test_tf = tf.transform(X_test) 

In [33]:
lr = LogisticRegression(solver='saga')

lr.fit(X_train_tf, y_train)
(lr.score(X_train_tf, y_train), lr.score(X_test_tf, y_test))

(0.87275, 0.7385)

# df2

In [34]:
df2 = pd.read_csv('rg_clean.csv')

X = df2['text']
y = df2['target_col']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2,
                                                    random_state=42)

In [35]:
tf = TfidfVectorizer(stop_words=stopwords.words("english"),
                     max_features=None,
                     ngram_range=(1,1),
                     strip_accents='ascii' 
                    )

tf.fit(X_train)
X_train_tf = tf.transform(X_train).todense()
X_test_tf = tf.transform(X_test).todense()

X_train_tf = tf.transform(X_train) #.todense()
X_test_tf = tf.transform(X_test) 

In [36]:
lr = LogisticRegression(solver='saga')

lr.fit(X_train_tf, y_train)
(lr.score(X_train_tf, y_train), lr.score(X_test_tf, y_test))

(0.87275, 0.7385)

# df

In [40]:
df = pd.read_csv('data/ascii_clean.csv') 

X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2,
                                                    random_state=42)

In [41]:
tf = TfidfVectorizer(stop_words=stopwords.words("english"),
                     max_features=None,
                     ngram_range=(1,1),
                     strip_accents='ascii' 
                    )

tf.fit(X_train)
X_train_tf = tf.transform(X_train).todense()
X_test_tf = tf.transform(X_test).todense()

X_train_tf = tf.transform(X_train) #.todense()
X_test_tf = tf.transform(X_test) 

In [42]:
lr = LogisticRegression(solver='saga')

lr.fit(X_train_tf, y_train)
(lr.score(X_train_tf, y_train), lr.score(X_test_tf, y_test))

(0.87125, 0.7435)