<a href="https://colab.research.google.com/github/anjalinagel12/NLP/blob/master/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Load the Pandas libraries with alias 'pd' 
import pandas as pd 
# Read data from file 'filename.csv' 
# (in the same directory that your python process is based)
# Control delimiters, rows, column names with read_csv (see later) 
training_data = pd.read_csv("/content/training_data.csv") 
# Preview the first 5 lines of the loaded data 
training_data.drop(['_id', 'id'], axis='columns', inplace=True)
training_data.tail()

Unnamed: 0,text,label
2614,"it's ""true"" or not and that ""truth"" is availab...",0
2615,And these slogans don't even denote any sense ...,0
2616,&gt;whole-bodyWhile,0
2617,that the majority of them are affected negativ...,1
2618,"And just trying to be romantic with women, ass...",0


In [12]:
validate_data = pd.read_csv("/content/validate_data.csv") 
# Preview the first 5 lines of the loaded data 
validate_data.drop(['_id', 'id'], axis='columns', inplace=True)

validate_data.head()

Unnamed: 0,text,label
0,changing the US laws would not change internat...,1
1,There will always be evil people that bring mi...,1
2,Remember reasonable people actually voted Hitl...,0
3,", but did",0
4,", which",0


##1. Preprocessing Dataset


###Remove punctuation


In [13]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
#Function to remove Punctuation
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])# It will discard all punctuations
    return text_nopunct

training_data['body_text_clean'] = training_data['text'].apply(lambda x: remove_punct(x))

training_data.head()

Unnamed: 0,text,label,body_text_clean
0,Since communism has been relegated to just a h...,0,Since communism has been relegated to just a h...
1,Can you counter that?,0,Can you counter that
2,Censorship does not eliminate the censored ind...,0,Censorship does not eliminate the censored ind...
3,"Without the extra population from abortions, h...",0,Without the extra population from abortions ha...
4,I can't stand it,1,I cant stand it


###Tokenization

In [16]:
import re

# Function to Tokenize words
def tokenize(text):
    tokens = re.split('\W+', text) #W+ means that either a word character (A-Za-z0-9_) or a dash (-) can go there.
    return tokens

training_data['body_text_tokenized'] = training_data['body_text_clean'].apply(lambda x: tokenize(x.lower())) 
#We convert to lower as Python is case-sensitive. 

training_data.head()

Unnamed: 0,text,label,body_text_clean,body_text_tokenized
0,Since communism has been relegated to just a h...,0,Since communism has been relegated to just a h...,"[since, communism, has, been, relegated, to, j..."
1,Can you counter that?,0,Can you counter that,"[can, you, counter, that]"
2,Censorship does not eliminate the censored ind...,0,Censorship does not eliminate the censored ind...,"[censorship, does, not, eliminate, the, censor..."
3,"Without the extra population from abortions, h...",0,Without the extra population from abortions ha...,"[without, the, extra, population, from, aborti..."
4,I can't stand it,1,I cant stand it,"[i, cant, stand, it]"


###Remove stopwords


In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
stopword = nltk.corpus.stopwords.words('english')# All English Stopwords


In [25]:
# Function to remove Stopwords
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]# To remove all stopwords
    return text

training_data['body_text_nostop'] = training_data['body_text_tokenized'].apply(lambda x: remove_stopwords(x))
training_data.head()

Unnamed: 0,text,label,body_text_clean,body_text_tokenized,body_text_nostop
0,Since communism has been relegated to just a h...,0,Since communism has been relegated to just a h...,"[since, communism, has, been, relegated, to, j...","[since, communism, relegated, handful, countries]"
1,Can you counter that?,0,Can you counter that,"[can, you, counter, that]",[counter]
2,Censorship does not eliminate the censored ind...,0,Censorship does not eliminate the censored ind...,"[censorship, does, not, eliminate, the, censor...","[censorship, eliminate, censored, individual]"
3,"Without the extra population from abortions, h...",0,Without the extra population from abortions ha...,"[without, the, extra, population, from, aborti...","[without, extra, population, abortions, propor..."
4,I can't stand it,1,I cant stand it,"[i, cant, stand, it]","[cant, stand]"


###Preprocessing Data: Using Stemming


In [27]:
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

training_data['body_text_stemmed'] = training_data['body_text_nostop'].apply(lambda x: stemming(x))

training_data.head()

Unnamed: 0,text,label,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed
0,Since communism has been relegated to just a h...,0,Since communism has been relegated to just a h...,"[since, communism, has, been, relegated, to, j...","[since, communism, relegated, handful, countries]","[sinc, commun, releg, hand, countri]"
1,Can you counter that?,0,Can you counter that,"[can, you, counter, that]",[counter],[counter]
2,Censorship does not eliminate the censored ind...,0,Censorship does not eliminate the censored ind...,"[censorship, does, not, eliminate, the, censor...","[censorship, eliminate, censored, individual]","[censorship, elimin, censor, individu]"
3,"Without the extra population from abortions, h...",0,Without the extra population from abortions ha...,"[without, the, extra, population, from, aborti...","[without, extra, population, abortions, propor...","[without, extra, popul, abort, proportion, imm..."
4,I can't stand it,1,I cant stand it,"[i, cant, stand, it]","[cant, stand]","[cant, stand]"


###Preprocessing Data: Using a Lemmatizer


In [44]:
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

training_data['title'] = training_data['body_text_nostop'].apply(lambda x: lemmatizing(x))

training_data.head(10)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized,title
0,Since communism has been relegated to just a h...,0,Since communism has been relegated to just a h...,"[since, communism, has, been, relegated, to, j...","[since, communism, relegated, handful, countries]","[sinc, commun, releg, hand, countri]","[since, communism, relegated, handful, country]","[since, communism, relegated, handful, country]"
1,Can you counter that?,0,Can you counter that,"[can, you, counter, that]",[counter],[counter],[counter],[counter]
2,Censorship does not eliminate the censored ind...,0,Censorship does not eliminate the censored ind...,"[censorship, does, not, eliminate, the, censor...","[censorship, eliminate, censored, individual]","[censorship, elimin, censor, individu]","[censorship, eliminate, censored, individual]","[censorship, eliminate, censored, individual]"
3,"Without the extra population from abortions, h...",0,Without the extra population from abortions ha...,"[without, the, extra, population, from, aborti...","[without, extra, population, abortions, propor...","[without, extra, popul, abort, proportion, imm...","[without, extra, population, abortion, proport...","[without, extra, population, abortion, proport..."
4,I can't stand it,1,I cant stand it,"[i, cant, stand, it]","[cant, stand]","[cant, stand]","[cant, stand]","[cant, stand]"
5,This is much more important than it's faithful...,1,This is much more important than its faithfuln...,"[this, is, much, more, important, than, its, f...","[much, important, faithfulness, source, material]","[much, import, faith, sourc, materi]","[much, important, faithfulness, source, material]","[much, important, faithfulness, source, material]"
6,Every thought stands of its own individual mer...,0,Every thought stands of its own individual merits,"[every, thought, stands, of, its, own, individ...","[every, thought, stands, individual, merits]","[everi, thought, stand, individu, merit]","[every, thought, stand, individual, merit]","[every, thought, stand, individual, merit]"
7,you don't have to necessarily think you're bet...,0,you dont have to necessarily think youre bette...,"[you, dont, have, to, necessarily, think, your...","[dont, necessarily, think, youre, better, peop...","[dont, necessarili, think, your, better, peopl...","[dont, necessarily, think, youre, better, peop...","[dont, necessarily, think, youre, better, peop..."
8,You don't understand the signals you're sendin...,1,You dont understand the signals youre sending ...,"[you, dont, understand, the, signals, youre, s...","[dont, understand, signals, youre, sending, ge...","[dont, understand, signal, your, send, get, up...","[dont, understand, signal, youre, sending, get...","[dont, understand, signal, youre, sending, get..."
9,neighbors went out but no one had the head to ...,0,neighbors went out but no one had the head to ...,"[neighbors, went, out, but, no, one, had, the,...","[neighbors, went, one, head, keep, guy, check,...","[neighbor, went, one, head, keep, guy, check, ...","[neighbor, went, one, head, keep, guy, check, ...","[neighbor, went, one, head, keep, guy, check, ..."


In [89]:
training_data['title']

0                                                           [since, communism, relegated, handful, country]
1                                                                                                 [counter]
2                                                             [censorship, eliminate, censored, individual]
3       [without, extra, population, abortion, proportionately, immigration, fill, labor, need, million,...
4                                                                                             [cant, stand]
                                                       ...                                                 
2614                                                             [true, truth, available, u, absolute, way]
2615                [slogan, dont, even, denote, sense, superiority, foreign, country, preference, country]
2616                                                                                   [gtwholebodywhile, ]
2617                        

In [45]:
# We save the file as CSV 
# CSV(Comma-separated-value), It is easier to read compared to tsv(Tab-separated-value) and can be opened on Excel.
training_data.to_csv("SMSSpamCollection_cleaned.csv", sep=',')

##Vectorizing Data: Bag-Of-Words


In [63]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100) # To extend column width

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("/content/training_data.csv")
print(data.head())
data.drop(['_id', 'id'], axis='columns', inplace=True)


                        _id  ... label
0  60aa7a1732091e6833a2a7ef  ...     0
1  60aa7a1732091e6833a2a7f0  ...     0
2  60aa7a1732091e6833a2a7f1  ...     0
3  60aa7a1732091e6833a2a7f2  ...     0
4  60aa7a1732091e6833a2a7f3  ...     1

[5 rows x 4 columns]


In [64]:
data.columns = ['text', 'label']

In [65]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [66]:

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['text'])
print(X_counts.shape)
print(count_vect.get_feature_names())

(2619, 3904)
['', '0', '000', '0306814897first', '1', '10', '100', '1000', '10000', '1000000', '101', '11', '11000', '12', '125174', '13', '15', '18th', '1980', '19yearold', '1a', '2', '20', '200', '2004', '2006', '2014', '2015', '21', '211', '22', '247', '25', '25002100', '2nd', '3', '30', '31', '31415926', '32', '33', '36', '3dprint', '3g', '4', '400k', '4060', '42', '45', '47', '499999', '4a', '4chan', '5', '50', '500', '50000', '55', '55000', '5even', '5grand', '6', '60', '600', '62', '68', '6th', '7', '70', '75124', '8', '80', '800odd', '80that', '8217murican', '835', '8710', '8710a', '8710alreadi', '8710edit', '8710gt', '8710in', '8710that', '916', '955', '99', 'abandon', 'abhorr', 'abil', 'abl', 'abort', 'abraham', 'absenc', 'absolut', 'abstract', 'absurd', 'abus', 'academ', 'academiclevel', 'accept', 'acceptableinevit', 'acceptablewhat', 'access', 'accid', 'accident', 'accomod', 'accomplish', 'accord', 'account', 'accur', 'accuraci', 'achiev', 'acknowledg', 'acr', 'across', 'ac

In [67]:
X_counts


<2619x3904 sparse matrix of type '<class 'numpy.int64'>'
	with 19644 stored elements in Compressed Sparse Row format>

In [68]:
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names())
X_counts_df.head(10)

Unnamed: 0,Unnamed: 1,0,000,0306814897first,1,10,100,1000,10000,1000000,101,11,11000,12,125174,13,15,18th,1980,19yearold,1a,2,20,200,2004,2006,2014,2015,21,211,22,247,25,25002100,2nd,3,30,31,31415926,32,...,wronghttptheconversationcomorderingthevegetarianmealtheresmoreanimalbloodonyourhands4659,wrongopt,wrote,wwi,x,xenophob,xfactor,xiv,xivhttpwwwparadoxplacecomphoto20pagesfrancefrancehistoryimageslouisxivbar800jpg,ye,yeah,year,yell,yellow,yeshttpwwwthescientistcomarticlesviewarticleno40459titleinheritedintelligenceso,yet,you8710,youalso,youand,youd,yougt,youi,youll,young,younger,your,youv,youwhen,zeal,zealand,zealotri,zeitgeist,zero,zionism,zionist,zone,zoo,zoro,à,δ
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [69]:
from sklearn.feature_extraction.text import CountVectorizer

ngram_vect = CountVectorizer(ngram_range=(2,2),analyzer=clean_text) # It applies only bigram vectorizer
X_counts = ngram_vect.fit_transform(data['text'])
print(X_counts.shape)
print(ngram_vect.get_feature_names())

(2619, 3904)
['', '0', '000', '0306814897first', '1', '10', '100', '1000', '10000', '1000000', '101', '11', '11000', '12', '125174', '13', '15', '18th', '1980', '19yearold', '1a', '2', '20', '200', '2004', '2006', '2014', '2015', '21', '211', '22', '247', '25', '25002100', '2nd', '3', '30', '31', '31415926', '32', '33', '36', '3dprint', '3g', '4', '400k', '4060', '42', '45', '47', '499999', '4a', '4chan', '5', '50', '500', '50000', '55', '55000', '5even', '5grand', '6', '60', '600', '62', '68', '6th', '7', '70', '75124', '8', '80', '800odd', '80that', '8217murican', '835', '8710', '8710a', '8710alreadi', '8710edit', '8710gt', '8710in', '8710that', '916', '955', '99', 'abandon', 'abhorr', 'abil', 'abl', 'abort', 'abraham', 'absenc', 'absolut', 'abstract', 'absurd', 'abus', 'academ', 'academiclevel', 'accept', 'acceptableinevit', 'acceptablewhat', 'access', 'accid', 'accident', 'accomod', 'accomplish', 'accord', 'account', 'accur', 'accuraci', 'achiev', 'acknowledg', 'acr', 'across', 'ac

In [70]:

X_counts_df = pd.DataFrame(X_counts.toarray(), columns=ngram_vect.get_feature_names())
X_counts_df.head(10)

Unnamed: 0,Unnamed: 1,0,000,0306814897first,1,10,100,1000,10000,1000000,101,11,11000,12,125174,13,15,18th,1980,19yearold,1a,2,20,200,2004,2006,2014,2015,21,211,22,247,25,25002100,2nd,3,30,31,31415926,32,...,wronghttptheconversationcomorderingthevegetarianmealtheresmoreanimalbloodonyourhands4659,wrongopt,wrote,wwi,x,xenophob,xfactor,xiv,xivhttpwwwparadoxplacecomphoto20pagesfrancefrancehistoryimageslouisxivbar800jpg,ye,yeah,year,yell,yellow,yeshttpwwwthescientistcomarticlesviewarticleno40459titleinheritedintelligenceso,yet,you8710,youalso,youand,youd,yougt,youi,youll,young,younger,your,youv,youwhen,zeal,zealand,zealotri,zeitgeist,zero,zionism,zionist,zone,zoo,zoro,à,δ
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['text'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())

(2619, 3904)
['', '0', '000', '0306814897first', '1', '10', '100', '1000', '10000', '1000000', '101', '11', '11000', '12', '125174', '13', '15', '18th', '1980', '19yearold', '1a', '2', '20', '200', '2004', '2006', '2014', '2015', '21', '211', '22', '247', '25', '25002100', '2nd', '3', '30', '31', '31415926', '32', '33', '36', '3dprint', '3g', '4', '400k', '4060', '42', '45', '47', '499999', '4a', '4chan', '5', '50', '500', '50000', '55', '55000', '5even', '5grand', '6', '60', '600', '62', '68', '6th', '7', '70', '75124', '8', '80', '800odd', '80that', '8217murican', '835', '8710', '8710a', '8710alreadi', '8710edit', '8710gt', '8710in', '8710that', '916', '955', '99', 'abandon', 'abhorr', 'abil', 'abl', 'abort', 'abraham', 'absenc', 'absolut', 'abstract', 'absurd', 'abus', 'academ', 'academiclevel', 'accept', 'acceptableinevit', 'acceptablewhat', 'access', 'accid', 'accident', 'accomod', 'accomplish', 'accord', 'account', 'accur', 'accuraci', 'achiev', 'acknowledg', 'acr', 'across', 'ac

In [72]:

X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names())
X_tfidf_df.head(10)

Unnamed: 0,Unnamed: 1,0,000,0306814897first,1,10,100,1000,10000,1000000,101,11,11000,12,125174,13,15,18th,1980,19yearold,1a,2,20,200,2004,2006,2014,2015,21,211,22,247,25,25002100,2nd,3,30,31,31415926,32,...,wronghttptheconversationcomorderingthevegetarianmealtheresmoreanimalbloodonyourhands4659,wrongopt,wrote,wwi,x,xenophob,xfactor,xiv,xivhttpwwwparadoxplacecomphoto20pagesfrancefrancehistoryimageslouisxivbar800jpg,ye,yeah,year,yell,yellow,yeshttpwwwthescientistcomarticlesviewarticleno40459titleinheritedintelligenceso,yet,you8710,youalso,youand,youd,yougt,youi,youll,young,younger,your,youv,youwhen,zeal,zealand,zealotri,zeitgeist,zero,zionism,zionist,zone,zoo,zoro,à,δ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.617801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
print(X_tfidf_df.loc[(X_tfidf_df!=0).any(axis=1)])


                  0  000  0306814897first    1  ...  zone  zoo  zoro    à    δ
0     0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
1     0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
2     0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
3     0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
4     0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
...        ...  ...  ...              ...  ...  ...   ...  ...   ...  ...  ...
2614  0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
2615  0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
2616  0.322288  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
2617  0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0
2618  0.000000  0.0  0.0              0.0  0.0  ...   0.0  0.0   0.0  0.0  0.0

[2612 rows x 3904 columns]


##Feature Engineering: Feature Creation


In [74]:
data = pd.read_csv("/content/training_data.csv")
print(data.head())
data.drop(['_id', 'id'], axis='columns', inplace=True)

                        _id  ... label
0  60aa7a1732091e6833a2a7ef  ...     0
1  60aa7a1732091e6833a2a7f0  ...     0
2  60aa7a1732091e6833a2a7f1  ...     0
3  60aa7a1732091e6833a2a7f2  ...     0
4  60aa7a1732091e6833a2a7f3  ...     1

[5 rows x 4 columns]


In [76]:
import string

# Function to calculate length of message excluding space
data['body_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))

data.head()

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['punct%'] = data['text'].apply(lambda x: count_punct(x))

data.head()

Unnamed: 0,text,label,body_len,punct%
0,Since communism has been relegated to just a handful of countries,0,55,0.0
1,Can you counter that?,0,18,5.6
2,Censorship does not eliminate the censored individual.,0,48,2.1
3,"Without the extra population from abortions, has there been proportionately more immigration to ...",0,200,1.5
4,I can't stand it,1,13,7.7


In [77]:
#For Visualizing to check if message body length and % of punctuation is good feature or not.
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [83]:
bins = np.linspace(0, 200, 40)

#plt.hist(data[data['label']=='0']['body_len'], bins, alpha=0.5, normed=True, label='0')
#plt.hist(data[data['label']=='1']['body_len'], bins, alpha=0.5, normed=True, label='1')
#plt.legend(loc='upper left')
#plt.show()

##Building ML Classifiers:Model Selection Random Forest with GridSearchCV


###Exploring parameters using GridSearchCV


In [93]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,3864,3865,3866,3867,3868,3869,3870,3871,3872,3873,3874,3875,3876,3877,3878,3879,3880,3881,3882,3883,3884,3885,3886,3887,3888,3889,3890,3891,3892,3893,3894,3895,3896,3897,3898,3899,3900,3901,3902,3903
0,55,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,18,5.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,48,2.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,200,1.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,13,7.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


###For CountVectorizer


In [94]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,11.243,2.01638,0.153035,0.023881,,300,"{'max_depth': None, 'n_estimators': 300}",0.696565,0.698473,0.717557,0.698473,0.720841,0.706382,0.01054,1
7,4.893926,0.039357,0.07306,0.00217,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.688931,0.698473,0.709924,0.696565,0.703633,0.699505,0.00703,2
10,6.145981,0.055247,0.092729,0.001964,,150,"{'max_depth': None, 'n_estimators': 150}",0.677481,0.692748,0.709924,0.700382,0.709369,0.697981,0.012048,3
8,9.70629,0.134805,0.129772,0.006679,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.700382,0.687023,0.700382,0.681298,0.709369,0.695691,0.010133,4
4,3.929487,0.03437,0.062038,0.002027,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.696565,0.677481,0.696565,0.692748,0.697897,0.692251,0.007583,5
