In [1]:
import pandas as pd
import numpy as np

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head(20)

Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,neutral
1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive
2,c9e8d1ef1c,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,neutral
3,f14f087215,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,negative
4,bf7473b12d,haha better drunken tweeting you mean?,better,positive
5,1915bebcb3,headache wanna see my Julie,headache,negative
6,2ab82634d5,had an awsome salad! I recommend getting the S...,had an awsome salad!,positive
7,a5a1c996c0,fine! Going to do my big walk today 20 or so ...,fine!,positive
8,a182b2638e,Thank a yoou how are you? #TwitterTakeover,Thank,positive
9,1dcb6fdb13,Why don't adobe realise no one WANTS to pay fo...,Why don't adobe realise no one WANTS to pay fo...,neutral


In [5]:
df.shape

(27486, 4)

In [6]:
df['text'][0]

'Spent the entire morning in a meeting w/ a vendor, and my boss was not happy w/ them. Lots of fun.  I had other plans for my morning'

In [7]:
doc = nlp(df['text'][0])

In [8]:
[token.text for token in doc]

['Spent',
 'the',
 'entire',
 'morning',
 'in',
 'a',
 'meeting',
 'w/',
 'a',
 'vendor',
 ',',
 'and',
 'my',
 'boss',
 'was',
 'not',
 'happy',
 'w/',
 'them',
 '.',
 'Lots',
 'of',
 'fun',
 '.',
 ' ',
 'I',
 'had',
 'other',
 'plans',
 'for',
 'my',
 'morning']

In [9]:
# Build one record per token of `doc`, exposing the linguistic attributes
# spaCy attached to it, then render the records as a table.
token_table = []
for token in doc:
    row = dict(
        Text=token.text,
        Lemma=token.lemma_,
        POS=token.pos_,
        TAG=token.tag_,
        DEP=token.dep_,
        Shape=token.shape_,
        Alpha=token.is_alpha,
        Stop=token.is_stop,
    )
    token_table.append(row)
display(pd.DataFrame(token_table))

Unnamed: 0,Text,Lemma,POS,TAG,DEP,Shape,Alpha,Stop
0,Spent,spend,VERB,VBD,ROOT,Xxxxx,True,False
1,the,the,DET,DT,det,xxx,True,True
2,entire,entire,ADJ,JJ,amod,xxxx,True,False
3,morning,morning,NOUN,NN,dobj,xxxx,True,False
4,in,in,ADP,IN,prep,xx,True,True
5,a,a,DET,DT,det,x,True,True
6,meeting,meeting,NOUN,NN,pobj,xxxx,True,False
7,w/,w/,ADV,WRB,dep,x/,False,False
8,a,a,DET,DT,det,x,True,True
9,vendor,vendor,NOUN,NN,dobj,xxxx,True,False


## Preprocessing

In [10]:
# Encode the sentiment labels as integers (positive=1, neutral=0, negative=-1).
# The result is assigned back to the column: calling replace(inplace=True) on
# df['sentiment'] operates on an intermediate Series and is not guaranteed to
# write through to df (it warns on recent pandas and is a no-op under
# Copy-on-Write).
df['sentiment'] = df['sentiment'].replace({'positive':1, 'neutral':0, 'negative':-1})

In [11]:
df['sentiment'].value_counts()

 0    11118
 1     8582
-1     7786
Name: sentiment, dtype: int64

In [12]:
df.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [13]:
df[df['text'].isna()]

Unnamed: 0,textID,text,selected_text,sentiment
13133,fdb77c3752,,,0


In [14]:
# Drop every row that contains a missing value; the isna() checks above report
# a single such row (label 13133, with both text and selected_text missing).
# The remaining rows keep their original index labels, so label 13133 no
# longer exists afterwards and label-based lookups such as df['text'][0]
# still refer to the same rows as before.
df.dropna(inplace=True)

In [15]:
df.isna().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [48]:
# Derive the modelling frame from df by leaving out the textID column,
# then preview the first rows.
data = df.drop(columns=['textID'])
data.head()

Unnamed: 0,text,selected_text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,0
1,Oh! Good idea about putting them on ice cream,Good,1
2,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,0
3,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,-1
4,haha better drunken tweeting you mean?,better,1


In [55]:
data['text'][2]

'says good (or should i say bad?) afternoon!  http://plurk.com/p/wxpdj'

## Tokenization

### NLP Pipeline

In [17]:
sent = nlp.create_pipe('sentencizer')

In [18]:
# add sentencizer before parser in nlp pipeline
# Guarded so the cell can be re-run: adding a component whose name is already
# registered in the pipeline raises an error.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe(sent, before='parser')

In [19]:
doc = nlp(data['text'][0])

In [20]:
# Print each sentence found in `doc`.
# The loop uses its own name: `sent` already holds the sentencizer component
# created earlier, and rebinding it here would leave a sentence span in that
# variable and break a re-run of the add_pipe cell.
for sentence in doc.sents:
    print(sentence)

Spent the entire morning in a meeting w/ a vendor, and my boss was not happy w/ them.
Lots of fun.
 I had other plans for my morning


### Stop Words

In [21]:
from spacy.lang.en.stop_words import STOP_WORDS

In [22]:
stopwords = list(STOP_WORDS)

In [46]:
#print(stopwords)

### Punctuation

In [24]:
import string

In [25]:
punct = string.punctuation

In [26]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Cleaning

In [27]:
def text_data_cleaning(sentence):
    """Tokenise a text with spaCy and return its normalised, filtered tokens.

    Each token is replaced by its lower-cased, stripped lemma. Tokens whose
    lemma is the placeholder "-PRON-" keep their lower-cased surface form
    instead. Empty tokens (for example whitespace-only tokens), stop words and
    single punctuation characters are discarded.

    Parameters
    ----------
    sentence : str
        Raw text to process; it is passed unchanged to the module-level ``nlp``.

    Returns
    -------
    list of str
        The surviving tokens in their original order.

    Notes
    -----
    Relies on the module-level names ``nlp``, ``stopwords`` and ``punct``.
    """
    # Set membership: `x in punct` on the punctuation *string* is a substring
    # test, which also matched '' and multi-character runs such as './' or ':;'
    # that merely happen to be adjacent inside string.punctuation.
    punct_chars = set(punct)
    # A set makes each stop-word lookup O(1) instead of scanning a list.
    stop_set = set(stopwords)

    cleaned_tokens = []
    for token in nlp(sentence):
        if token.lemma_ != "-PRON-":
            normalised = token.lemma_.lower().strip()
        else:
            normalised = token.lower_

        # Whitespace-only tokens collapse to '' after strip(); drop them explicitly.
        if not normalised:
            continue
        if normalised in stop_set or normalised in punct_chars:
            continue
        cleaned_tokens.append(normalised)
    return cleaned_tokens

In [28]:
data['text'][0]

'Spent the entire morning in a meeting w/ a vendor, and my boss was not happy w/ them. Lots of fun.  I had other plans for my morning'

In [58]:
text_data_cleaning(data['text'][0])

['spend',
 'entire',
 'morning',
 'meeting',
 'w/',
 'vendor',
 'boss',
 'happy',
 'w/',
 'lot',
 'fun',
 'plan',
 'morning']

### TF-IDF

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Use text_data_cleaning as the vectorizer's tokenizer, so the TF-IDF vocabulary
# is built from the tokens that function returns.
# NOTE(review): TfidfVectorizer's default lowercase=True presumably lowercases the
# raw text before it reaches text_data_cleaning (and therefore spaCy) — confirm
# this is intended.
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)

## Base Model

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

In [33]:
X = data['text']
y = data['sentiment']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [43]:
# Fit and transform in a single pass so every training text goes through the
# (slow) spaCy tokenizer once instead of twice.
token_dt = tfidf.fit_transform(X_train)
# fit() returns the vectorizer itself, so this keeps the original alias intact.
tokenized_dt = tfidf

# get_feature_names() was removed from newer scikit-learn releases; use its
# replacement when the installed version provides it.
if hasattr(tokenized_dt, 'get_feature_names_out'):
    feature_names = tokenized_dt.get_feature_names_out()
else:
    feature_names = tokenized_dt.get_feature_names()

# Keep the matrix sparse: densifying a (documents x vocabulary) matrix with
# .toarray() can require several gigabytes of memory.
tok_data = pd.DataFrame.sparse.from_spmatrix(token_dt, columns=feature_names)

In [44]:
tok_data.head()

Unnamed: 0,&lt;3,&quot,&quot;breakfast&quot,&quot;look,&quot;peter,'','em,'t,(:,(p.s,...,ï¿½iï¿½m,ï¿½stupidityï¿½,ï¿½tearï¿½,ï¿½timo,ï¿½ureo,ï¿½why,ï¿½whyyy,ï¿½you,ï¿½ï¿½,ï¿½ï¿½h
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Linear SVC

In [None]:
from sklearn.svm import LinearSVC

In [None]:
svc = LinearSVC()

In [None]:
X_train.shape, X_test.shape

In [None]:
svc_pipe = Pipeline([('tfidf', tfidf), ('LinearSVC', svc)])

In [None]:
svc_pipe.fit(X_train, y_train)

In [None]:
svc_pipe.score(X_test, y_test)

In [None]:
pred = svc_pipe.predict(X_test)

In [None]:
print(classification_report(y_test, pred))

In [None]:
confusion_matrix(y_test, pred)

### XGB

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier()

In [None]:
xgb_pipe = Pipeline([('tfidf', tfidf), ('XGB', xgb)])

In [None]:
# Fit the TF-IDF + XGBoost pipeline on the training split.
# NOTE(review): y_train holds the labels -1/0/1 produced by the sentiment
# encoding step; newer XGBoost releases reportedly require class labels
# 0..n_classes-1 — confirm this cell runs with the installed version (it has
# no recorded execution in this notebook).
xgb_pipe.fit(X_train, y_train)

In [None]:
pred = xgb_pipe.predict(X_test)

In [None]:
print(classification_report(y_test, pred))

In [None]:
confusion_matrix(y_test, pred)