In [1]:
import pandas as pd
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import re
import spacy

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
tokenizer = ToktokTokenizer()
nlp = spacy.load('en_core_web_sm',disable=['ner'])

In [4]:
TextBlob("he is very intelligent").sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [5]:
TextBlob("sun rises in the east").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [6]:
TextBlob("everybody says this man is poor").sentiment

Sentiment(polarity=-0.4, subjectivity=0.6)

In [11]:
train = pd.read_csv("/content/Train_2.csv")
train.head(3)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0


In [12]:
label_0 = train[train['label']==0].sample(n=5000)
label_1 = train[train['label']==1].sample(n=5000)

In [13]:
train = pd.concat([label_1,label_0])

In [14]:
from sklearn.utils import shuffle
train = shuffle(train)
train

Unnamed: 0,text,label
29217,I'm a large scarred heterosexual male ex-bounc...,1
9658,"Anyone who will pay to see Troma movies knows,...",0
9389,TESS OF THE STORM COUNTRY is possibly the best...,1
30381,I saw the film tonight at a free preview scree...,0
10245,Elizabeth Rohm was the weakest actress of all ...,0
...,...,...
18059,I'm a fan of the 1950's original and about 20 ...,0
3151,This is by far the worst movie i have ever see...,0
27601,Ed Wood is eclipsed and becomes Orson Welles. ...,0
6504,**May Contain Spoilers**<br /><br />A dude in ...,0


Data Preprocessing

In [15]:
train.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [16]:
import numpy as np

In [17]:
train.replace(r'^\s*$',np.nan,regex = True,inplace = True)
train.dropna(axis=0,how = 'any',inplace=True)

In [18]:
train.replace(to_replace=[r"\\t|\\n|\\r","\t|\n|\r"],value=["",""],regex=True,inplace = True)
print("escape sequence removed")

escape sequence removed


In [19]:
train.head(4)

Unnamed: 0,text,label
29217,I'm a large scarred heterosexual male ex-bounc...,1
9658,"Anyone who will pay to see Troma movies knows,...",0
9389,TESS OF THE STORM COUNTRY is possibly the best...,1
30381,I saw the film tonight at a free preview scree...,0


In [20]:
train["text"] = train["text"].str.encode('ascii','ignore').str.decode('ascii')
print('non ascii data removed')

non ascii data removed


In [21]:
def remove_punctuations(text):
  import string
  for punctuations in string.punctuation:
    text = text.replace(punctuations,'')
  return text

train['text'] = train['text'].apply(remove_punctuations)

In [22]:
sw_list = stopwords.words('english')
sw_list.remove('no')
sw_list.remove('not')

In [23]:
def remove_stopword(text,is_lower_case= False):
  tokens = tokenizer.tokenize(text)
  tokens = [tokens.strip() for tokens in tokens]
  if is_lower_case:
    filtered_tokens = [token for token in tokens if token not in sw_list]
  else:
    filtered_tokens = [token for token in tokens if token.lower() not in sw_list]
  filtered_text = ' '.join(filtered_tokens)
  return filtered_text

In [24]:
def remove_special_characters(text):
  text = re.sub('[^a-zA-z0-9\s]','',text)
  return text

In [25]:
train['text'] = train['text'].apply(remove_special_characters)

In [26]:
def remove_html(text):
  import re
  html_pattern = re.compile('<.*?>')
  return html_pattern.sub(r' ',text)

In [27]:
train['text'] = train['text'].apply(remove_html)

In [28]:
def remove_url(text):
  url = re.compile(r'https?://\S+|www\.\S+')
  return url.sub(r' ',text)

In [29]:
train['text'] = train['text'].apply(remove_url)

In [30]:
def remove_numbers(text):
  text = ''.join([i for i in text if not i.isdigit()])
  return text

In [31]:
train['text'] = train['text'].apply(remove_numbers)

In [39]:
def cleanse(word):
    rx = re.compile(r'^\D*\d')
    if rx.match(word):
        return ''
    return word

def remove_alphanumeric(string):
    nstrings = [" ".join(filter(None, (cleanse(word) for word in string.split())))]
    str1 = ' '.join(nstrings)
    return str1

In [41]:
train['text'] = train['text'].apply(cleanse)

In [40]:
train['text'] = train['text'].apply(remove_alphanumeric)

In [42]:
def lemmatize(text):
  text = nlp(text)
  text = ' '.join([word.lemma_  if word.lemma_ != '-PRON-' else word.text for word in text])
  return text

In [43]:
train['text'] = train['text'].apply(lemmatize)

In [44]:
train['sentiment'] = train['text'].apply(lambda tweet:TextBlob(tweet).sentiment)

In [45]:
sentiment_series = train['sentiment'].tolist()

In [46]:
columns = ['polarity','subjectivity']
df1 = pd.DataFrame(sentiment_series, columns = columns ,index = train.index)

In [47]:
result = pd.concat([train,df1],axis = 1)

In [48]:
result.drop(['sentiment'],axis=1,inplace = True)

In [49]:
result.loc[result['polarity']>=0.3,'Sentiment'] = 'Positive'
result.loc[result['polarity']<0.3,'Sentiment'] = 'Negative'

In [50]:
result

Unnamed: 0,text,label,polarity,subjectivity,Sentiment
29217,I m a large scarred heterosexual male exbounce...,1,0.195157,0.542965,Negative
9658,anyone who will pay to see Troma movie know an...,0,0.111198,0.528162,Negative
9389,tess of the STORM COUNTRY be possibly the good...,1,0.183298,0.463585,Negative
30381,I see the film tonight at a free preview scree...,0,0.045000,0.596667,Negative
10245,Elizabeth Rohm be the weak actress of all the ...,0,-0.118478,0.490166,Negative
...,...,...,...,...,...
18059,I m a fan of the s original and about minute i...,0,0.090152,0.537319,Negative
3151,this be by far the bad movie I have ever see i...,0,-0.230556,0.623148,Negative
27601,Ed Wood be eclipse and become Orson Welles thi...,0,-0.171429,0.680952,Negative
6504,may contain Spoilersbr br a dude in a dopeyloo...,0,0.132062,0.478301,Negative


In [51]:
result.loc[result['label']==1,'Sentiment_label'] = 1
result.loc[result['label']==0,'Sentiment_label'] = 0

In [53]:
result.head()

Unnamed: 0,text,label,polarity,subjectivity,Sentiment,Sentiment_label
29217,I m a large scarred heterosexual male exbounce...,1,0.195157,0.542965,Negative,1.0
9658,anyone who will pay to see Troma movie know an...,0,0.111198,0.528162,Negative,0.0
9389,tess of the STORM COUNTRY be possibly the good...,1,0.183298,0.463585,Negative,1.0
30381,I see the film tonight at a free preview scree...,0,0.045,0.596667,Negative,0.0
10245,Elizabeth Rohm be the weak actress of all the ...,0,-0.118478,0.490166,Negative,0.0
