In [63]:
import pandas as pd
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("IMDB Dataset.csv")

In [7]:
df['sentiment']=df['sentiment'].map({'negative':0,'positive':1})

In [9]:
df.shape

(50000, 2)

In [14]:
df.sample(10)

Unnamed: 0,review,sentiment
36176,"""Yagyu ichizoku no inbo"" (let's just say ""The ...",0
37252,American Pie: Beta House is sort of in limbo b...,0
21410,Bled is a very apt title for this As you watch...,0
20393,The story of the film was as simple minded as ...,0
22915,i went into watching this movie knowing it was...,0
11511,You will marvel at the incredibly sophisticate...,1
34144,"I expected a bad movie, and got a bad movie. B...",0
35095,This was the best Muppet movie I've seen ever!...,1
30754,Good show.<br /><br />The basic background is ...,1
8692,"This is a crummy film, a pretender to a genre ...",0


In [17]:
df['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

#### it is the balanced data

In [21]:
# converting text to lower case
df['review'] = df['review'].apply(lambda x : x.lower())

In [26]:
a = "ain't aren't i'd"
expand_contraction(a)

'is not are not i would'

In [27]:
# expanding the contraction

contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def expand_contraction(x):
    for key,item in contraction_mapping.items():
        x=x.replace(key,item,-1)
    return x



In [32]:
df['review']= df['review'].apply(lambda x : expand_contraction((x)))

In [36]:
# removing mails

df['review']=df['review'].apply(lambda x : (re.sub('\S+@\S+\.com','',x)))

In [37]:
# removing URL

regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
df['review']=df['review'].apply(lambda x : re.sub(regex, '', x))

In [38]:
# removing special character
df['review']=df['review'].apply(lambda x : re.sub('[^\w ]+','',x))

In [39]:
# removing multiple spaces

df['review']=df['review'].apply(lambda x : re.sub('\s{2,}',' ',x))

In [41]:
# removing HTML

from bs4 import BeautifulSoup 
df['review']=df['review'].apply(lambda x : BeautifulSoup(x,'html').get_text().strip())

In [42]:
# removing accented chars
import unicodedata
def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii','ignore').decode('utf-8','ignore')
    return x

In [43]:
df['review']=df['review'].apply(lambda x : remove_accented_chars(x))

In [44]:
# remove stopwords
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
def remove_stopwords(x):
    return ' '.join([w for w in x.split() if w not in stopwords])


df['review']=df['review'].apply(lambda x : remove_stopwords(x))

In [45]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [46]:
def convert_to_root(x):
    doc= nlp(x)
    x_list=[]
    for w in doc:
        root = w.lemma_
        x_list.append(root)
    return ' '.join(x_list)

In [47]:
df['review']=df['review'].apply(lambda x : convert_to_root(x))

### Data Preparation for model training

In [48]:
from sklearn.model_selection import train_test_split

In [50]:
X = df['review']
y = df['sentiment']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Modelling with pipeline 

In [55]:
pipe = Pipeline([('tfidf',TfidfVectorizer()),
                ('clf',LogisticRegression())
                ])

In [60]:
# running alone without gridsearchcv
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [61]:
pipe.score(X_test,y_test)

0.8876363636363637

In [62]:
# adding hyperparameter

hyperparameters = {
                'tfidf__max_df':(0.5,1.0),
                'tfidf__ngram_range':((1,1),(1,2)),
                'tfidf__use_idf':(True,False),
                'tfidf__analyzer':('word','char','char_wb'),
                'clf__penalty':('l2','l1'),
                'clf__C':(1,2)
                }

In [64]:
clf = GridSearchCV(pipe, hyperparameters,n_jobs=-1,cv= None)

In [65]:
%%time
clf.fit(X_train,y_train)

Wall time: 1h 13min


GridSearchCV(cv=None, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'tfidf__max_df': (0.5, 1.0), 'tfidf__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False), 'tfidf__analyzer': ('word', 'char', 'char_wb'), 'clf__penalty': ('l2', 'l1'), 'clf__C': (1, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [66]:
clf.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [67]:
clf.best_params_

{'clf__C': 2,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 1.0,
 'tfidf__ngram_range': (1, 1),
 'tfidf__use_idf': True}

In [68]:
clf.best_score_

0.8862686567164179

In [70]:
clf.best_index_

52

In [71]:
# now once we got the best model, than testing

In [72]:
y_pred=clf.predict(X_test)

In [73]:
from sklearn.metrics import confusion_matrix , classification_report

In [74]:
confusion_matrix(y_test,y_pred)

array([[7221,  987],
       [ 781, 7511]], dtype=int64)

In [76]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      8208
           1       0.88      0.91      0.89      8292

   micro avg       0.89      0.89      0.89     16500
   macro avg       0.89      0.89      0.89     16500
weighted avg       0.89      0.89      0.89     16500



## good results... we did hyperparameter tuning with solid results

In [77]:
ls

 Volume in drive C is Windows-SSD
 Volume Serial Number is 6018-D6B6

 Directory of C:\MyCode\GitHub\NLP\Text cleaning and preprocessing

27.10.2020  13:12    <DIR>          .
27.10.2020  13:12    <DIR>          ..
26.10.2020  15:50    <DIR>          .ipynb_checkpoints
26.10.2020  13:18            13.341 GridSearchCV.ipynb
21.10.2020  12:14            20.383 Handling Textfile JSON PDF Audio speech.ipynb
26.10.2020  15:53        66.212.309 IMDB Dataset.csv
27.10.2020  13:12            21.539 IMDB Sentiment analysis.ipynb
26.10.2020  13:42             6.404 Multiple model check.ipynb
26.10.2020  14:17             4.730 Random Search ( fine tuning).ipynb
20.10.2020  15:59                50 README.md
20.10.2020  16:05            14.999 Regular Expression.ipynb
23.10.2020  10:36           332.633 SPAM Text Classification.ipynb
22.10.2020  09:08           503.663 spam.csv
26.10.2020  12:00           160.393 Text cleaning & preprocessing.ipynb
23.10.2020  16:46            18.883 Twitter Senti