# Text Preprocessing

## Tokenizing

In [2]:
text = "This is Andrew's text, isn't it?"

In [118]:
# tokenizing using string commands
# needs text preprocessing before
def text_token(text,sep=" "):
    text = text.lower().split()
    return text
text_token(text)

['this', 'is', "andrew's", 'text,', "isn't", 'it?']

In [119]:
#tokenizing using sklearn built-in functions
#needs text preprocessing before

In [3]:
#tokenizing using NLTK built-in functions
from nltk.tokenize import WordPunctTokenizer , SpaceTokenizer , TreebankWordTokenizer 

Tokenizer_1 = WordPunctTokenizer()
Tokenizer_2 = SpaceTokenizer()
Tokenizer_3 = TreebankWordTokenizer()

print("Tokenizer 1 :",Tokenizer_1.tokenize(text)) 
print("Tokenizer 2 :",Tokenizer_2.tokenize(text)) 
print("Tokenizer 3 :",Tokenizer_3.tokenize(text)) 

Tokenizer 1 : ['This', 'is', 'Andrew', "'", 's', 'text', ',', 'isn', "'", 't', 'it', '?']
Tokenizer 2 : ['This', 'is', "Andrew's", 'text,', "isn't", 'it?']
Tokenizer 3 : ['This', 'is', 'Andrew', "'s", 'text', ',', 'is', "n't", 'it', '?']


## Token Normalization 

In [4]:
# Stemming 
from nltk.stem import PorterStemmer
text = "I was walking with my friends , didn't i ?"
Tokens = Tokenizer_3.tokenize(text)

Stemmer = PorterStemmer()

" ".join(Stemmer.stem(token) for token in Tokens)


"I wa walk with my friend , did n't i ?"

In [5]:
# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
Tokens = Tokenizer_3.tokenize(text)

Lemmatizer = WordNetLemmatizer()

" ".join(Lemmatizer.lemmatize(token) for token in Tokens)

[nltk_data] Downloading package wordnet to /home/mohamed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


"I wa walking with my friend , did n't i ?"

# Feature Extraction

## BAG OF WORDS

In [6]:
import pandas as pd 
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer , HashingVectorizer
from sklearn.model_selection import train_test_split , KFold , RandomizedSearchCV 
import seaborn as sns

In [8]:
# kaggle sentiment analysis dataset
df = pd.read_csv("data/sen/train.tsv",sep="\t")

In [9]:
df.tail(20)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
156040,156041,8544,"is darkly atmospheric , with Herrmann quietly ...",2
156041,156042,8544,"is darkly atmospheric , with Herrmann quietly ...",2
156042,156043,8544,"is darkly atmospheric ,",2
156043,156044,8544,is darkly atmospheric,3
156044,156045,8544,with Herrmann quietly suggesting the sadness a...,2
156045,156046,8544,Herrmann quietly suggesting the sadness and ob...,2
156046,156047,8544,Herrmann,2
156047,156048,8544,quietly suggesting the sadness and obsession b...,1
156048,156049,8544,suggesting the sadness and obsession beneath H...,2
156049,156050,8544,suggesting the sadness and obsession,2


In [10]:
data = df.groupby("SentenceId").mean()["Sentiment"]
df["new"] = df["SentenceId"].apply(lambda x : data[x])


In [11]:
df.head(100)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,new
0,1,1,A series of escapades demonstrating the adage ...,1,1.984127
1,2,1,A series of escapades demonstrating the adage ...,2,1.984127
2,3,1,A series,2,1.984127
3,4,1,A,2,1.984127
4,5,1,series,2,1.984127
...,...,...,...,...,...
95,96,3,", I suspect ,",2,1.714286
96,97,3,"I suspect ,",2,1.714286
97,98,3,I suspect,2,1.714286
98,99,3,I,2,1.714286


In [12]:
df["Sentiment"].value_counts()/df.shape[0]

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64

In [13]:
df["Phrase"]=df["Phrase"].str.lower()

In [14]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,new
0,1,1,a series of escapades demonstrating the adage ...,1,1.984127
1,2,1,a series of escapades demonstrating the adage ...,2,1.984127
2,3,1,a series,2,1.984127
3,4,1,a,2,1.984127
4,5,1,series,2,1.984127


In [15]:
## Data Cleaning Pipe line 
import re
import string

def pipeline(text):
    
    tokens = Tokenizer_3.tokenize(text)
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    remove_digits = str.maketrans('', '', string.digits)
    text = " ".join(Lemmatizer.lemmatize(t) for t in stripped)
    text = text.strip()
    text = text.translate(remove_digits)
    
    return text
df["Phrase"]=df["Phrase"].apply(lambda x : pipeline(x))


In [16]:
corpus = df["Phrase"].values
y_train = df["Sentiment"].values

In [17]:
vectorizer = HashingVectorizer(stop_words="english",ngram_range=(1,5))
X = vectorizer.fit_transform(corpus)

In [76]:
X_train , X_val , y_train , y_val = train_test_split(X,y_train,test_size=0.2)

In [77]:
from sklearn.ensemble import AdaBoostClassifier , RandomForestClassifier , GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression , RidgeClassifier 
from sklearn.metrics import accuracy_score , confusion_matrix 
from sklearn.naive_bayes import BernoulliNB , ClassifierMixin , GaussianNB , ComplementNB 
from sklearn.neighbors import KNeighborsClassifier
clf_1 = RandomForestClassifier()

In [78]:
def evaluate(clf):
    
    clf.fit(X_train,y_train)
    preds = clf.predict(X_val)
    preds_train = clf.predict(X_train)
    print("accuracy :",accuracy_score(y_val,preds))
    print("accuracy_train :",accuracy_score(y_train,preds_train))


In [79]:
evaluate(LogisticRegression())

accuracy : 0.626393694732795
accuracy_train : 0.707884787902089


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
