In [1]:
import pandas as pd
import numpy as np

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head(40)

Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,neutral
1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive
2,c9e8d1ef1c,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,neutral
3,f14f087215,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,negative
4,bf7473b12d,haha better drunken tweeting you mean?,better,positive
5,1915bebcb3,headache wanna see my Julie,headache,negative
6,2ab82634d5,had an awsome salad! I recommend getting the S...,had an awsome salad!,positive
7,a5a1c996c0,fine! Going to do my big walk today 20 or so ...,fine!,positive
8,a182b2638e,Thank a yoou how are you? #TwitterTakeover,Thank,positive
9,1dcb6fdb13,Why don't adobe realise no one WANTS to pay fo...,Why don't adobe realise no one WANTS to pay fo...,neutral


In [5]:
df.shape

(27486, 4)

In [6]:
df['text'][2].replace('[^a-zA-Z#]', ' ')

'says good (or should i say bad?) afternoon!  http://plurk.com/p/wxpdj'

In [7]:
doc = nlp(df['text'][2])

In [8]:
[token.text for token in doc]

['says',
 'good',
 '(',
 'or',
 'should',
 'i',
 'say',
 'bad',
 '?',
 ')',
 'afternoon',
 '!',
 ' ',
 'http://plurk.com/p/wxpdj']

In [9]:
token_table = []
for token in doc:
    token_table.append({'Text':token.text, 
                       'Lemma':token.lemma_,
                       'POS':token.pos_, 
                       'TAG':token.tag_, 
                       'DEP':token.dep_,
                       'Shape':token.shape_, 
                       'Alpha':token.is_alpha, 
                       'Stop':token.is_stop})
display(pd.DataFrame(token_table))

Unnamed: 0,Text,Lemma,POS,TAG,DEP,Shape,Alpha,Stop
0,says,say,VERB,VBZ,ROOT,xxxx,True,False
1,good,good,ADJ,JJ,nsubj,xxxx,True,False
2,(,(,PUNCT,-LRB-,punct,(,False,False
3,or,or,CCONJ,CC,cc,xx,True,True
4,should,should,VERB,MD,aux,xxxx,True,True
5,i,i,PRON,PRP,nsubj,x,True,True
6,say,say,VERB,VB,ROOT,xxx,True,True
7,bad,bad,ADJ,JJ,acomp,xxx,True,False
8,?,?,PUNCT,.,punct,?,False,False
9,),),PUNCT,-RRB-,punct,),False,False


## Preprocessing

In [10]:
df['sentiment'].replace({'positive':1, 'neutral':0, 'negative':-1}, inplace=True)

In [11]:
df['sentiment'].value_counts()

 0    11118
 1     8582
-1     7786
Name: sentiment, dtype: int64

In [12]:
df.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [13]:
df[df['text'].isna()]

Unnamed: 0,textID,text,selected_text,sentiment
13133,fdb77c3752,,,0


In [14]:
df.dropna(inplace=True)

In [15]:
df.isna().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [16]:
data = df.drop(['textID'], axis=1)
data.head()

Unnamed: 0,text,selected_text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,0
1,Oh! Good idea about putting them on ice cream,Good,1
2,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,0
3,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,-1
4,haha better drunken tweeting you mean?,better,1


In [17]:
data['text'][2]

'says good (or should i say bad?) afternoon!  http://plurk.com/p/wxpdj'

## Cleaning

### NLP Pipeline

In [18]:
sent = nlp.create_pipe('sentencizer')

In [19]:
# add sentencizer before parser in nlp pipeline
nlp.add_pipe(sent, before='parser')

In [20]:
doc = nlp(data['text'][0])

In [21]:
for sent in doc.sents:
    print(sent)

Spent the entire morning in a meeting w/ a vendor, and my boss was not happy w/ them.
Lots of fun.
 I had other plans for my morning


### Stop Words

In [22]:
from spacy.lang.en.stop_words import STOP_WORDS

In [23]:
stopwords = list(STOP_WORDS)

In [24]:
#print(stopwords)

### Punctuations

In [25]:
import string

In [26]:
punct = string.punctuation

In [27]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Cleaning

In [28]:
import re

In [29]:
def text_data_cleaning(text):
    #text = text.str.replace('[^a-zA-Z#]', ' ')
    
    doc = nlp(text)
    
    tokens = []
    for token in doc:
        if token.pos_ == "NUM": #remove numbers and urls
            temp = ''
        elif token.lemma_ != "-PRON-": #remove prounouns and change words to lower case
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_ #change words to lower case
        tokens.append(temp)
    
    tokens = [re.sub(r'[^a-zA-Z#]', ' ', file) for file in tokens] #replace special characters to whitespace
    
    tokens = [re.sub(r'\s', '', file) for file in tokens] #remove whitespaces
    
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct: #remove stop words and punctuations
            cleaned_tokens.append(token)
    
    long_tokens = []
    for token in cleaned_tokens: #remove words with length shorter than 3
        if len(token) >= 3:
            long_tokens.append(token)
    
    return long_tokens

In [30]:
data['text'][20]

' ....welcome to public transport  X'

In [31]:
text_data_cleaning(data['text'][20])

['welcome', 'public', 'transport']

### TF-IDF

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)

In [34]:
#token_text = tfidf.fit_transform(data['text'][:11])

In [35]:
#pd.DataFrame(token_text[0].T.todense(), index=tfidf.get_feature_names(), columns=["tfidf"])

## Tokenization

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

In [37]:
X = data['text']
y = data['sentiment']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [39]:
#tokenized_dt = tfidf.fit(X_train)
#token_dt = tfidf.transform(X_train)
#tok_data = pd.DataFrame(columns=tokenized_dt.get_feature_names(), data=token_dt.toarray())

In [40]:
#tokenized_dt.get_feature_names()

In [41]:
#tok_data.head()

## Base Model

### Linear SVC

In [42]:
from sklearn.svm import LinearSVC

In [43]:
svc = LinearSVC()

In [44]:
X_train.shape, X_test.shape

((21988,), (5497,))

In [45]:
svc_pipe = Pipeline([('tfidf', tfidf), ('LinearSVC', svc)])

In [46]:
svc_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function text_data_cleaning at 0x109b17048>,
                                 use_idf=True, vocabulary=None)),
                ('LinearSVC',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_interc

In [47]:
svc_pipe.score(X_test, y_test)

0.6623612879752593

In [48]:
pred = svc_pipe.predict(X_test)

In [49]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          -1       0.66      0.62      0.64      1543
           0       0.63      0.66      0.65      2259
           1       0.71      0.70      0.70      1695

    accuracy                           0.66      5497
   macro avg       0.67      0.66      0.66      5497
weighted avg       0.66      0.66      0.66      5497



In [50]:
confusion_matrix(y_test, pred)

array([[ 962,  489,   92],
       [ 376, 1499,  384],
       [ 118,  397, 1180]])

### XGB

In [51]:
from xgboost import XGBClassifier

In [52]:
xgb = XGBClassifier()

In [53]:
xgb_pipe = Pipeline([('tfidf', tfidf), ('XGB', xgb)])

In [54]:
xgb_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                        

In [55]:
pred = xgb_pipe.predict(X_test)

In [56]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          -1       0.78      0.42      0.54      1543
           0       0.56      0.85      0.68      2259
           1       0.78      0.58      0.67      1695

    accuracy                           0.65      5497
   macro avg       0.71      0.62      0.63      5497
weighted avg       0.69      0.65      0.64      5497



In [57]:
confusion_matrix(y_test, pred)

array([[ 642,  823,   78],
       [ 142, 1919,  198],
       [  44,  662,  989]])