# Modeling - Logistic Regression (after deep data cleaning)

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import spacy

from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import plot_confusion_matrix, balanced_accuracy_score

In [62]:
#read in data

postings = pd.read_csv('../datasets/postings_cleaned.csv')

In [63]:
postings

Unnamed: 0,topic,text
0,hiking,boulder flatiron loop hike
1,hiking,washington state lakes to swim
2,hiking,here a fun episode demonstrating why a is a fo...
3,hiking,picture i took of my friend at angel s landing...
4,hiking,hiking to bertha peak via cougar crest trail a...
...,...,...
11852,gardening,nope haha oklahoma so not too far away
11853,gardening,i transitioning to everbearing in strawberry p...
11854,gardening,so pretty
11855,gardening,i think it s chance of frost after that date


In [64]:
#define X and y

X = postings['text']
y = np.where(postings['topic'] == 'hiking',1,0)

In [65]:
#checking X

X

0                               boulder flatiron loop hike
1                           washington state lakes to swim
2        here a fun episode demonstrating why a is a fo...
3        picture i took of my friend at angel s landing...
4        hiking to bertha peak via cougar crest trail a...
                               ...                        
11852               nope haha oklahoma so not too far away
11853    i transitioning to everbearing in strawberry p...
11854                                            so pretty
11855         i think it s chance of frost after that date
11856    my understanding is that the caterpillars eat ...
Name: text, Length: 11857, dtype: object

In [66]:
#checking y

y

array([1, 1, 1, ..., 0, 0, 0])

### Train, test, split

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=123)

### Creating a null model

In [68]:
pd.DataFrame(y).value_counts(normalize=True)

1    0.555031
0    0.444969
dtype: float64

In [None]:
# I have to create a model that does better than the 56% rate of the null model
# This didn't change much after deep cleaning

In [69]:
def input_to_df(model, cleaning, lemmatized):
    """
    Appends model information into model summary dataframe for comparison.

    Reads '../datasets/model_list_summary.csv', adds one row describing
    `model`, and writes the file back. Every call adds a row, so re-running
    the calling cell records the same model again.

    Parameters
    ----------
    model : fitted search object
        Must expose `score`, `predict` and `best_params_`.
    cleaning : str
        Indicates if data was minimally or deeply cleaned.
    lemmatized : str
        'no' scores the model on X_train / X_test;
        'yes' scores it on X_train_lem / X_test_lem.

    Raises
    ------
    ValueError
        If `lemmatized` is neither 'yes' nor 'no'.
    """
    summary_path = '../datasets/model_list_summary.csv'

    # Pick the feature sets matching the text the model was trained on.
    # Validate up front so a typo fails with a clear message instead of an
    # unbound-variable error further down.
    if lemmatized == 'no':
        features_train, features_test = X_train, X_test
    elif lemmatized == 'yes':
        features_train, features_test = X_train_lem, X_test_lem
    else:
        raise ValueError(
            f"lemmatized must be 'yes' or 'no', got {lemmatized!r}"
        )

    df = pd.read_csv(summary_path)

    info = [cleaning,          #Indicates if data was minimally or deeply cleaned
            lemmatized,        #Indicates if text was lemmatized or not
            model.score(features_train, y_train),
            model.score(features_test, y_test),
            balanced_accuracy_score(y_train, model.predict(features_train)),
            balanced_accuracy_score(y_test, model.predict(features_test)),
            model.best_params_
           ]

    # DataFrame.append was removed in pandas 2.0; build a one-row frame with
    # the summary's own columns and concatenate instead.
    new_row = pd.DataFrame([info], columns=df.columns)
    df = pd.concat([df, new_row], ignore_index=True)
    df.to_csv(summary_path, index=False)

### Building my pipe and GS

In [70]:
def model_fit_and_scores(model):
    """
    Fits `model` on X_train / y_train, then prints `model.score` and the
    balanced accuracy for both the train and test splits, followed by the
    model's best parameters.
    """
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    print(f' accuracy score on train data:  {train_score}')

    test_score = model.score(X_test, y_test)
    print(f' accuracy score on test data: {test_score}')

    train_balanced = balanced_accuracy_score(y_train, model.predict(X_train))
    print(f' balanced accuracy score on train data: {train_balanced}')

    test_balanced = balanced_accuracy_score(y_test, model.predict(X_test))
    print(f' balanced accuracy score on test data: {test_balanced}')

    print(model.best_params_)

In [15]:
# Word counts -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# CountVectorizer produces.
pipe5 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(),
)

# Hyperparameter grid; keys follow make_pipeline's lowercase step names.
params5 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[4000, 5000],
    countvectorizer__ngram_range=[(1, 1), (1, 2), (2, 2)],
    logisticregression__C=[0.001, 0.01, 0.1],
)

gs5 = GridSearchCV(pipe5, params5, n_jobs=-1)

In [16]:
model_fit_and_scores(gs5)

 accuracy score on train data:  0.9613135402609086
 accuracy score on test data: 0.8978077571669477
 balanced accuracy score on train data: 0.9593373445388995
 balanced accuracy score on test data: 0.8948550348813537
{'countvectorizer__max_features': 5000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'logisticregression__C': 0.001}


In [71]:
# Word counts -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# CountVectorizer produces.
pipe6 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(),
)

# English stop words only, with tighter ranges for max_features and C.
params6 = dict(
    countvectorizer__stop_words=['english'],
    countvectorizer__max_features=[4500, 5000, 5500],
    countvectorizer__ngram_range=[(1, 1), (1, 2)],
    logisticregression__C=[0.0008, 0.001, 0.002],
)

gs6 = GridSearchCV(pipe6, params6, n_jobs=-1)

In [72]:
model_fit_and_scores(gs6)

 accuracy score on train data:  0.9560278902384165
 accuracy score on test data: 0.897133220910624
 balanced accuracy score on train data: 0.9536739555080336
 balanced accuracy score on test data: 0.8933438012707076
{'countvectorizer__max_features': 4500, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'logisticregression__C': 0.0008}


In [73]:
# Since this is the best so far, I will input this into my model summary

input_to_df(gs6, 'deep', 'no')

### Lemmatized did better than non-lemmatized

In [84]:
# Created a new function for lemmatized data

def model_lem_fit_and_scores(model):
    """
    Fits `model` on X_train_lem / y_train, then prints `model.score` and the
    balanced accuracy for both the lemmatized train and test splits, followed
    by the model's best parameters.
    """
    model.fit(X_train_lem, y_train)

    train_score = model.score(X_train_lem, y_train)
    print(f' Score on train data:  {train_score}')

    test_score = model.score(X_test_lem, y_test)
    print(f' Score on test data: {test_score}')

    train_balanced = balanced_accuracy_score(y_train, model.predict(X_train_lem))
    print(f' balanced accuracy score on train data: {train_balanced}')

    test_balanced = balanced_accuracy_score(y_test, model.predict(X_test_lem))
    print(f' balanced accuracy score on test data: {test_balanced}')

    print(model.best_params_)

In [4]:
nlp = spacy.load('en_core_web_sm')
def lemmatized(sentence):
    """
    Runs `sentence` through the loaded spaCy pipeline (`nlp`) and returns the
    lemma of every token, joined by single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(sentence))

In [76]:
X_train_lem = X_train.apply(lemmatized)

In [77]:
X_test_lem = X_test.apply(lemmatized)

In [29]:
# Word counts -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# CountVectorizer produces.
pipe2 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(max_iter=10000),
)

# Hyperparameter grid; keys follow make_pipeline's lowercase step names.
params2 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[3000, 4000],
    countvectorizer__ngram_range=[(1, 1), (1, 2)],
    logisticregression__C=[0.005, 0.01],
)

gs2 = GridSearchCV(pipe2, params2, n_jobs=-1)

In [30]:
model_lem_fit_and_scores(gs2)

 Score on train data:  0.9692982456140351
 Score on test data: 0.9048903878583474
 balanced accuracy score on train data: 0.9685591998482164
 balanced accuracy score on test data: 0.9037946196214408
{'countvectorizer__max_features': 4000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': None, 'logisticregression__C': 0.005}


In [31]:
# Word counts -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# CountVectorizer produces.
pipe3 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(max_iter=10000),
)

# Hyperparameter grid; keys follow make_pipeline's lowercase step names.
params3 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[4000, 5000],
    countvectorizer__ngram_range=[(1, 1), (1, 2)],
    logisticregression__C=[0.003, 0.005, 0.007],
)

gs3 = GridSearchCV(pipe3, params3, n_jobs=-1)

In [32]:
model_lem_fit_and_scores(gs3)

 Score on train data:  0.9728969860548808
 Score on test data: 0.9042158516020236
 balanced accuracy score on train data: 0.9720016776087623
 balanced accuracy score on test data: 0.9023586943604871
{'countvectorizer__max_features': 5000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': None, 'logisticregression__C': 0.003}


In [33]:
# Word counts -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# CountVectorizer produces.
pipe4 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(max_iter=10000),
)

# Unigrams only; varies stop words, vocabulary size and C.
params4 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[4500, 5000],
    countvectorizer__ngram_range=[(1, 1)],
    logisticregression__C=[0.001, 0.003],
)

gs4 = GridSearchCV(pipe4, params4, n_jobs=-1)

In [34]:
model_lem_fit_and_scores(gs4)

 Score on train data:  0.9691857849752586
 Score on test data: 0.9042158516020236
 balanced accuracy score on train data: 0.9681824291989956
 balanced accuracy score on test data: 0.9022833860107946
{'countvectorizer__max_features': 4500, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': None, 'logisticregression__C': 0.003}


In [86]:
# Word counts -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# CountVectorizer produces.
pipe10 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(max_iter=10000),
)

# Unigrams only; varies stop words, vocabulary size and C.
params10 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[4000, 4500],
    countvectorizer__ngram_range=[(1, 1)],
    logisticregression__C=[0.004, 0.003, 0.005],
)

gs10 = GridSearchCV(pipe10, params10, n_jobs=-1)

In [87]:
model_lem_fit_and_scores(gs10)

 Score on train data:  0.9691857849752586
 Score on test data: 0.9042158516020236
 balanced accuracy score on train data: 0.9681824291989956
 balanced accuracy score on test data: 0.9022833860107946
{'countvectorizer__max_features': 4500, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': None, 'logisticregression__C': 0.003}


In [80]:
# Since this is the best so far with lemmatized text, I will input this into my model summary

input_to_df(gs10, 'deep', 'yes')

### ^^ Best so far!

### TfidfVectorizer didn't do better than CountVectorizer

In [81]:
# TF-IDF weights -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# TfidfVectorizer produces.
pipe20 = make_pipeline(
    TfidfVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(max_iter=10000),
)

# Hyperparameter grid; keys follow make_pipeline's lowercase step names.
params20 = dict(
    tfidfvectorizer__stop_words=[None, 'english'],
    tfidfvectorizer__max_features=[4000, 5000],
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2)],
    logisticregression__C=[0.003, 0.005],
)

gs20 = GridSearchCV(pipe20, params20, n_jobs=-1)

In [82]:
model_lem_fit_and_scores(gs20)

 Score on train data:  0.9742465137201979
 Score on test data: 0.8984822934232715
 balanced accuracy score on train data: 0.9732425243095802
 balanced accuracy score on test data: 0.896592193541077
{'logisticregression__C': 0.003, 'tfidfvectorizer__max_features': 5000, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


In [58]:
# TF-IDF weights -> scaling -> logistic regression.
# with_mean=False: centering is not supported on the sparse matrix that
# TfidfVectorizer produces.
pipe21 = make_pipeline(
    TfidfVectorizer(),
    StandardScaler(with_mean=False),
    LogisticRegression(max_iter=10000),
)

# Unigrams only; varies stop words, vocabulary size and C.
params21 = dict(
    tfidfvectorizer__stop_words=[None, 'english'],
    tfidfvectorizer__max_features=[5500, 5200],
    tfidfvectorizer__ngram_range=[(1, 1)],
    logisticregression__C=[0.0005, 0.001, 0.003],
)

gs21 = GridSearchCV(pipe21, params21, n_jobs=-1)

In [59]:
model_lem_fit_and_scores(gs21)

 Score on train data:  0.9665991902834008
 Score on test data: 0.896795952782462
 balanced accuracy score on train data: 0.9648004037322186
 balanced accuracy score on test data: 0.8933412679623081
{'logisticregression__C': 0.0005, 'tfidfvectorizer__max_features': 5500, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


In [83]:
# I will save this into summary to show tfidfvectorizer did slightly worse this time than countvectorizer

input_to_df(gs20, 'deep', 'yes')

In [1]:
# Checking which words have the largest positive and negative model coefficients...

In [88]:
best = gs10.best_estimator_

In [89]:
# Coefficients of the fitted logistic regression; row 0 is used below for
# this binary problem.
coefs = best.named_steps['logisticregression'].coef_

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs, so
# the cell runs on either version.
vectorizer = best.named_steps['countvectorizer']
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

In [90]:
coef_df = pd.DataFrame({'coefs': coefs[0], 'word': vocab})

In [41]:
coef_df.head()

Unnamed: 0,coefs,word
0,0.019826,ab
1,0.0157,abandon
2,-0.001103,abbey
3,0.003283,ability
4,0.00528,able


In [91]:
coef_df.nlargest(10,'coefs')

Unnamed: 0,coefs,word
1756,0.445701,hike
4041,0.255881,trail
1758,0.223324,hiking
2485,0.185376,mountain
2806,0.183078,park
860,0.155665,colorado
2072,0.140908,lake
4163,0.134003,usa
4229,0.129965,view
501,0.129964,boot


In [92]:
coef_df.nsmallest(10,'coefs')

Unnamed: 0,coefs,word
2941,-0.285682,plant
1530,-0.260657,garden
1632,-0.181512,grow
1445,-0.157635,flower
2518,-0.14445,my
4007,-0.143598,tomato
4093,-0.142555,tulip
465,-0.123279,bloom
1533,-0.11988,gardening
4055,-0.11867,tree


- Checking that my lemmatization worked by comparing X_train and X_train_lem
- I found out that spaCy lemmatizes words like "hiking" based on their part of speech. For example, when "hiking" is used as a verb it is lemmatized to "hike", but when it is used as an adjective or a noun it is left unchanged. Hence, "hiking" is still present in some texts after lemmatization.

In [5]:
#It does lemmatize "hiking" because it's a verb

lemmatized('are you going hiking gardening running')

'be you go hike gardening run'

In [6]:
#It does not lemmatize "hiking" because it's an adjective
lemmatized('i cannot bring my hiking gear tomorrow')

'I can not bring my hiking gear tomorrow'

In [45]:
X_train[3296]

'help finding a good hiking backpack'

In [46]:
X_train_lem[3296]

'help find a good hiking backpack'

In [None]:
# Change in count below shows that hundreds of "hiking" were indeed lemmatized.

In [48]:
len([text for text in X_train if "hiking" in text])

593

In [49]:
len([text for text in X_train_lem if "hiking" in text])

217