# Texas Classification Using TF-IDF

## Here we will use a simple text dataset consisting of senteces about Monty Python and describing ice cream flavors.
### The unit of observation (*documents*) will be the sentences of these topics.

In [92]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import en_core_web_sm
nlp = en_core_web_sm.load()
from nltk.corpus import gutenberg
import nltk
import warnings
from sklearn import (datasets, model_selection, feature_extraction, linear_model, naive_bayes, ensemble)
import collections
from collections import Counter
import nltk
import re
import multiprocessing as mp 
import textacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
#!python -m spacy download en
warnings.filterwarnings("ignore")
nltk.download('gutenberg')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
# Import data
path = "D:\\Dropbox\\dev_data\\re\\pro\\tf_idf_data.csv"
df0 = pd.DataFrame()
df0 = pd.read_csv(path)

In [3]:
df0.head()

Unnamed: 0,text,label
0,"""The best Monty Python sketch is the one about...",Monty Python
1,"""I laugh when I think about Python's Ministry...",Monty Python
2,"""Chocolate is the best ice cream dessert topp...",Ice Cream
3,"""The Lumberjack Song is the funniest Monty Py...",Monty Python
4,"""I would rather put strawberries on my ice cr...",Ice Cream


### A helper function for removing some punctuation marks and numbers from the text:

In [4]:
############################################# DO NOT DELETE ############################################# 
# Function to move specific column to the left side for easier view
def move_to_left(df, column_name):
    df= df[ [str(column_name)] + [ col for col in df.columns if col != str(column_name) ] ]
    return df

In [48]:
# Utility function for standard text cleaning
def text_cleaner(text):
    text = re.sub(r'"','',text)
    text = re.sub("[\[].*?[\]];", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

In [49]:
df0['cleaned'] = df0['text'].apply(text_cleaner)

In [57]:
df1 = df0.copy()

In [60]:
df1.head(2)

Unnamed: 0,text,label,cleaned
0,"""The best Monty Python sketch is the one about...",Monty Python,good monty python sketch dead parrot laugh hard
1,"""I laugh when I think about Python's Ministry...",Monty Python,laugh think python ministry silly walk sketch ...


In [40]:
#df1['cleaned'] = [','.join(ele.split()) for ele in df1['cleaned']]

In [59]:
df1['cleaned'] = df1['cleaned'].str.replace("[^\w\s]", "").str.lower()

df1['cleaned'] = df1['cleaned'].apply(lambda text: 
                                          " ".join(token.lemma_ for token in nlp(text) 
                                                   if not token.is_stop))

In [61]:
df1.head(2)

Unnamed: 0,text,label,cleaned
0,"""The best Monty Python sketch is the one about...",Monty Python,good monty python sketch dead parrot laugh hard
1,"""I laugh when I think about Python's Ministry...",Monty Python,laugh think python ministry silly walk sketch ...


# Prepare Vertorization using 1-gram and 2-grams

In [93]:
# 1-gram
vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=1, use_idf=True, norm=u'l2', smooth_idf=True, ngram_range=(1,1))

# applying the vectorizer
X = vectorizer.fit_transform(df1["cleaned"])

tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_1gram = pd.concat([tfidf_df, df1[[ "label"]]], axis=1)


In [95]:
# 2-grams
vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=1, use_idf=True, norm=u'l2', smooth_idf=True, ngram_range=(1,2))

# applying the vectorizer
X = vectorizer.fit_transform(df1["cleaned"])

tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_2gram = pd.concat([tfidf_df, df1[["label"]]], axis=1)


In [96]:
df_1gram.head()

Unnamed: 0,accompaniment,bit,caramel,chocolate,cream,dead,dessert,fantastic,funniest,funny,...,silly,sketch,song,strawberry,taste,tasty,think,top,walk,label
0,0.0,0.0,0.0,0.0,0.0,0.434231,0.0,0.0,0.0,0.0,...,0.0,0.356076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Monty Python
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.793233,...,0.264411,0.216821,0.0,0.0,0.0,0.0,0.216821,0.0,0.264411,Monty Python
2,0.0,0.0,0.0,0.44236,0.306252,0.0,0.362742,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.306252,0.0,0.0,0.44236,0.0,Ice Cream
3,0.0,0.370029,0.0,0.0,0.0,0.0,0.0,0.0,0.370029,0.0,...,0.0,0.0,0.370029,0.0,0.0,0.0,0.303429,0.0,0.0,Monty Python
4,0.0,0.0,0.0,0.0,0.392555,0.0,0.464964,0.0,0.0,0.0,...,0.0,0.0,0.0,0.567019,0.392555,0.0,0.0,0.0,0.0,Ice Cream


In [97]:
df_1gram.shape, df_2gram.shape

((6, 31), (6, 70))

## Use multiple machine learning algorithms on 1-gram:

In [103]:
Y = df_1gram['label']
X = np.array(df_1gram.drop(['label'], 1))
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [104]:
# Models
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier()

gbc_params = {"n_estimators": [3, 5, 10, 15],"max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier()

SDG_params= {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier()

NB_Multi_params = {'alpha': [1.0]}
NB_Multi= naive_bayes.MultinomialNB()

NB_Bern_params = {'alpha': [1.0]}
NB_Bern = naive_bayes.BernoulliNB()

In [105]:
# GridsearchCV
clf_lr = GridSearchCV(lr, lr_params, cv=2)
clf_lr.fit(X_train, y_train)

clf_rfc = GridSearchCV(rfc, rfc_params, cv=2)
clf_rfc.fit(X_train, y_train)

clf_gbc = GridSearchCV(gbc, gbc_params, cv=2)
clf_gbc.fit(X_train, y_train)

clf_SDG = GridSearchCV(SDG, SDG_params, cv=2)
clf_SDG.fit(X_train, y_train)

clf_NB_Multi = GridSearchCV(NB_Multi, NB_Multi_params, cv=2)
clf_NB_Multi.fit(X_train, y_train)

clf_NB_Bern = GridSearchCV(NB_Bern, NB_Bern_params, cv=2)
clf_NB_Bern.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=BernoulliNB(), param_grid={'alpha': [1.0]})

In [106]:
# Fit the models
lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)
SDG.fit(X_train, y_train)
NB_Multi.fit(X_train, y_train)
NB_Bern.fit(X_train, y_train)

BernoulliNB()

In [107]:
# Check some basic performance
print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

print("----------------------Stochastic Gradient Descent Scores----------------------")
print('Training set score:', SDG.score(X_train, y_train))
print('\nTest set score:', SDG.score(X_test, y_test))

print("---------------------- Naive Bayes Multinominal Scores----------------------")
print('Training set score:', NB_Multi.score(X_train, y_train))
print('\nTest set score:', NB_Multi.score(X_test, y_test))

print("----------------------Naive Bayes Bernoulli Scores----------------------")
print('Training set score:', NB_Bern.score(X_train, y_train))
print('\nTest set score:', NB_Bern.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.6666666666666666

Test set score: 0.3333333333333333
----------------------Random Forest Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Gradient Boosting Scores----------------------
Training set score: 1.0

Test set score: 1.0
----------------------Stochastic Gradient Descent Scores----------------------
Training set score: 1.0

Test set score: 1.0
---------------------- Naive Bayes Multinominal Scores----------------------
Training set score: 1.0

Test set score: 0.3333333333333333
----------------------Naive Bayes Bernoulli Scores----------------------
Training set score: 1.0

Test set score: 1.0


## Use multiple machine learning algorithms on 2-grama:

In [108]:
Y = df_2gram['label']
X = np.array(df_2gram.drop(['label'], 1))
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [109]:
# Models
lr_params = {"penalty": ["l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15], "max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier()

gbc_params = {"n_estimators": [3, 5, 10, 15],"max_depth": [2, 3, 4, 5], "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier()

SDG_params= {'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e3], 'penalty': ['l2'], 'n_jobs': [-1]}
SDG = linear_model.SGDClassifier()

NB_Multi_params = {'alpha': [1.0]}
NB_Multi= naive_bayes.MultinomialNB()

NB_Bern_params = {'alpha': [1.0]}
NB_Bern = naive_bayes.BernoulliNB()

In [111]:
# GridsearchCV
clf_lr = GridSearchCV(lr, lr_params, cv=2)
clf_lr.fit(X_train, y_train)

clf_rfc = GridSearchCV(rfc, rfc_params, cv=2)
clf_rfc.fit(X_train, y_train)

clf_gbc = GridSearchCV(gbc, gbc_params, cv=2)
clf_gbc.fit(X_train, y_train)

clf_SDG = GridSearchCV(SDG, SDG_params, cv=2)
clf_SDG.fit(X_train, y_train)

clf_NB_Multi = GridSearchCV(NB_Multi, NB_Multi_params, cv=2)
clf_NB_Multi.fit(X_train, y_train)

clf_NB_Bern = GridSearchCV(NB_Bern, NB_Bern_params, cv=2)
clf_NB_Bern.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=BernoulliNB(), param_grid={'alpha': [1.0]})

In [112]:
# Fit the models
lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)
SDG.fit(X_train, y_train)
NB_Multi.fit(X_train, y_train)
NB_Bern.fit(X_train, y_train)

BernoulliNB()

In [113]:
# Check some basic performance
print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

print("----------------------Stochastic Gradient Descent Scores----------------------")
print('Training set score:', SDG.score(X_train, y_train))
print('\nTest set score:', SDG.score(X_test, y_test))

print("---------------------- Naive Bayes Multinominal Scores----------------------")
print('Training set score:', NB_Multi.score(X_train, y_train))
print('\nTest set score:', NB_Multi.score(X_test, y_test))

print("----------------------Naive Bayes Bernoulli Scores----------------------")
print('Training set score:', NB_Bern.score(X_train, y_train))
print('\nTest set score:', NB_Bern.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.6666666666666666

Test set score: 0.3333333333333333
----------------------Random Forest Scores----------------------
Training set score: 1.0

Test set score: 0.6666666666666666
----------------------Gradient Boosting Scores----------------------
Training set score: 1.0

Test set score: 0.3333333333333333
----------------------Stochastic Gradient Descent Scores----------------------
Training set score: 1.0

Test set score: 1.0
---------------------- Naive Bayes Multinominal Scores----------------------
Training set score: 1.0

Test set score: 0.3333333333333333
----------------------Naive Bayes Bernoulli Scores----------------------
Training set score: 1.0

Test set score: 1.0


# Conclustion

Adding 2-grams did not improve the result. However, because the dateset is very small we are facing overfitting problems.