In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from textblob import TextBlob
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

In [2]:
test = "Hello, how are you ? The weather is great, and Python is awesome."

print(sent_tokenize(test))

['Hello, how are you ?', 'The weather is great, and Python is awesome.']


In [4]:
test2=" I am a Programmer, i have also skills of Graphics Designing"
print(word_tokenize(test2))

['I', 'am', 'a', 'Programmer', ',', 'i', 'have', 'also', 'skills', 'of', 'Graphics', 'Designing']


In [5]:
text = ''' The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant
       '''

test2 = TextBlob(text)

for i in test2.sentences:
    print(i.sentiment.polarity)

0.06000000000000001
-0.34166666666666673


In [6]:
text = ''' When you are set to begin with your tasks, deal with the hardest one 
       at first 
       '''

test2 = TextBlob(text)

for i in test2.sentences:
    print(i.sentiment.polarity)

0.25


In [7]:
df= pd.read_csv('clean_data.csv')

In [8]:
df.head()

Unnamed: 0,original,Document
0,"In the following excerpt, originally published...",following excerpt originally published italian...
1,by Umberto Eco,umberto eco
2,In 1953 Ian Fleming published the first novel ...,1953 ian fleming published first novel 007 ser...
3,"To Spillane, Casino Royale owes, beyond doubt,...",spillane casino royale owes beyond doubt least...
4,In the second place Bond is obsessed by an ima...,second place bond obsessed image japanese expe...


In [9]:
TextBlob(df["Document"].iloc[0]).sentiment

Sentiment(polarity=0.215, subjectivity=0.32)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7,ngram_range=(1,3) ,stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(df["Document"]).toarray()

In [11]:
tfidfconverter.get_feature_names()



['007',
 'absolutely',
 'act',
 'action',
 'almost',
 'already',
 'also',
 'always',
 'ambiguous',
 'among',
 'anglo',
 'anglo saxon',
 'another',
 'appearance',
 'appears',
 'author',
 'beginning',
 'big',
 'black',
 'blofeld',
 'blood',
 'body',
 'bond',
 'book',
 'call',
 'case',
 'casino',
 'casino royale',
 'century',
 'certain',
 'chance',
 'character',
 'characteristic',
 'chiffre',
 'choice',
 'cold',
 'communist',
 'conflict',
 'could',
 'country',
 'couple',
 'course',
 'cruel',
 'de',
 'death',
 'describe',
 'description',
 'detail',
 'diamond',
 'different',
 'doe',
 'dominated',
 'dr',
 'drax',
 'ear',
 'either',
 'element',
 'elementary',
 'end',
 'enemy',
 'english',
 'erotic',
 'escape',
 'even',
 'every',
 'evil',
 'example',
 'except',
 'excess',
 'eye',
 'eyelash',
 'face',
 'fact',
 'figure',
 'final',
 'first',
 'five',
 'fleming',
 'following',
 'forever',
 'form',
 'four',
 'free',
 'free world',
 'function',
 'game',
 'general',
 'german',
 'girl',
 'give',
 'go

In [12]:
len(tfidfconverter.get_feature_names())

266

In [13]:
df['Document'][:]

0      following excerpt originally published italian...
1                                            umberto eco
2      1953 ian fleming published first novel 007 ser...
3      spillane casino royale owes beyond doubt least...
4      second place bond obsessed image japanese expe...
                             ...                        
119    hand woman fleming cannot accept decadent arch...
120    however concerned psychological interpretation...
121    message doe really end except concrete local r...
122    source eco umberto narrative structure fleming...
123             twentieth century literary criticism 193
Name: Document, Length: 124, dtype: object

In [14]:
fdist = FreqDist()

In [15]:
for x in str(df['Document'][:]).split():
    fdist[x]+=1
    
len(fdist)
    

72

In [16]:
fdist.most_common(50)

[('published', 2),
 ('umberto', 2),
 ('eco', 2),
 ('fleming', 2),
 ('0', 1),
 ('following', 1),
 ('excerpt', 1),
 ('originally', 1),
 ('italian...', 1),
 ('1', 1),
 ('2', 1),
 ('1953', 1),
 ('ian', 1),
 ('first', 1),
 ('novel', 1),
 ('007', 1),
 ('ser...', 1),
 ('3', 1),
 ('spillane', 1),
 ('casino', 1),
 ('royale', 1),
 ('owes', 1),
 ('beyond', 1),
 ('doubt', 1),
 ('least...', 1),
 ('4', 1),
 ('second', 1),
 ('place', 1),
 ('bond', 1),
 ('obsessed', 1),
 ('image', 1),
 ('japanese', 1),
 ('expe...', 1),
 ('...', 1),
 ('119', 1),
 ('hand', 1),
 ('woman', 1),
 ('cannot', 1),
 ('accept', 1),
 ('decadent', 1),
 ('arch...', 1),
 ('120', 1),
 ('however', 1),
 ('concerned', 1),
 ('psychological', 1),
 ('interpretation...', 1),
 ('121', 1),
 ('message', 1),
 ('doe', 1),
 ('really', 1)]

In [17]:
ser= pd.Series(tfidfconverter.get_feature_names())
ser.count()

266

In [19]:
df['polarity']= df["Document"].apply(lambda x : TextBlob(x).sentiment[0])
df['Subjectivity']= df["Document"].apply(lambda x : TextBlob(x).sentiment[1])

In [44]:
df.head()

Unnamed: 0,original,Document,polarity,Subjectivity
0,"In the following excerpt, originally published...",following excerpt originally published italian...,0.215,0.32
1,by Umberto Eco,umberto eco,0.0,0.0
2,In 1953 Ian Fleming published the first novel ...,1953 ian fleming published first novel 007 ser...,-0.103704,0.483333
3,"To Spillane, Casino Royale owes, beyond doubt,...",spillane casino royale owes beyond doubt least...,0.101852,0.455556
4,In the second place Bond is obsessed by an ima...,second place bond obsessed image japanese expe...,-0.040132,0.318933


In [45]:
df['Lenght']= df["Document"].apply(lambda x: len(x.split()))

In [46]:
df.head()

Unnamed: 0,original,Document,polarity,Subjectivity,Lenght
0,"In the following excerpt, originally published...",following excerpt originally published italian...,0.215,0.32,23
1,by Umberto Eco,umberto eco,0.0,0.0,2
2,In 1953 Ian Fleming published the first novel ...,1953 ian fleming published first novel 007 ser...,-0.103704,0.483333,30
3,"To Spillane, Casino Royale owes, beyond doubt,...",spillane casino royale owes beyond doubt least...,0.101852,0.455556,51
4,In the second place Bond is obsessed by an ima...,second place bond obsessed image japanese expe...,-0.040132,0.318933,99


In [49]:
x=len(df)
neg=[]
pos=[]
neu=[]
comp=[]
for i in range(0,x):
    def sa(i):
        score = SentimentIntensityAnalyzer().polarity_scores(i)
        v1 = score['neg']
        positive = score['pos']
        neutral = score['neu']
        compound = score['comp']
        neg.append(v1)
        pos.append(positive)
        neu.append(neutral)
        comp.append(compound)
        
    sa(df["Document"][i])

In [50]:
df["Negative Sentiment"]=neg
df["Positive Sentiment"]=pos
df["Neutral Sentiment"]=neu
df["Compound Sentiment"]=comp

In [23]:
df.head(25)

Unnamed: 0,original,Document,polarity,Subjectivity,Lenght,Negative Sentiment,Positive Sentiment,Neutral Sentiment,Compound Sentiment
0,"In the following excerpt, originally published...",following excerpt originally published italian...,0.215,0.32,23,0.0,0.222,0.778,0.7184
1,by Umberto Eco,umberto eco,0.0,0.0,2,0.0,0.0,1.0,0.0
2,In 1953 Ian Fleming published the first novel ...,1953 ian fleming published first novel 007 ser...,-0.103704,0.483333,30,0.237,0.131,0.631,-0.6369
3,"To Spillane, Casino Royale owes, beyond doubt,...",spillane casino royale owes beyond doubt least...,0.101852,0.455556,51,0.311,0.335,0.354,0.0772
4,In the second place Bond is obsessed by an ima...,second place bond obsessed image japanese expe...,-0.040132,0.318933,99,0.246,0.107,0.648,-0.9485
5,After having helped to dispose of two Bulgaria...,helped dispose two bulgarian tried get rid suf...,0.007692,0.461538,151,0.273,0.232,0.495,-0.8841
6,With this lapidary phrase Fleming defines the ...,lapidary phrase fleming defines character jame...,-0.005,0.433333,101,0.178,0.249,0.573,0.6959
7,From the psychological point of view a convers...,psychological point view conversion ha taken p...,0.029663,0.353968,53,0.0,0.096,0.904,0.659
8,In Casino Royale there are already all the ele...,casino royale already element building machine...,0.210714,0.304592,74,0.022,0.247,0.731,0.9654
9,The juxtaposition of the characters and of val...,juxtaposition character value,0.0,0.0,3,0.0,0.545,0.455,0.34


In [24]:
df.loc[df['polarity'] >= 0.001, 'Sentiment p/n'] = 1

df.loc[df['polarity'] <= -0.001, 'Sentiment p/n'] = -1

df.loc[df['polarity'] == 0.0, 'Sentiment p/n'] = 0

df['Sentiment p/n']

0      1.0
1      0.0
2     -1.0
3      1.0
4     -1.0
      ... 
119    1.0
120    1.0
121    1.0
122    0.0
123    1.0
Name: Sentiment p/n, Length: 124, dtype: float64

In [25]:
df

Unnamed: 0,original,Document,polarity,Subjectivity,Lenght,Negative Sentiment,Positive Sentiment,Neutral Sentiment,Compound Sentiment,Sentiment p/n
0,"In the following excerpt, originally published...",following excerpt originally published italian...,0.215000,0.320000,23,0.000,0.222,0.778,0.7184,1.0
1,by Umberto Eco,umberto eco,0.000000,0.000000,2,0.000,0.000,1.000,0.0000,0.0
2,In 1953 Ian Fleming published the first novel ...,1953 ian fleming published first novel 007 ser...,-0.103704,0.483333,30,0.237,0.131,0.631,-0.6369,-1.0
3,"To Spillane, Casino Royale owes, beyond doubt,...",spillane casino royale owes beyond doubt least...,0.101852,0.455556,51,0.311,0.335,0.354,0.0772,1.0
4,In the second place Bond is obsessed by an ima...,second place bond obsessed image japanese expe...,-0.040132,0.318933,99,0.246,0.107,0.648,-0.9485,-1.0
...,...,...,...,...,...,...,...,...,...,...
119,"On the other hand, for woman Fleming cannot ac...",hand woman fleming cannot accept decadent arch...,0.006250,0.400000,20,0.176,0.196,0.628,0.1321,1.0
120,"However, we are not here concerned with a psyc...",however concerned psychological interpretation...,0.187010,0.593382,82,0.117,0.255,0.628,0.9289,1.0
121,"And, again, a message does not really end exce...",message doe really end except concrete local r...,0.087500,0.141667,24,0.091,0.000,0.909,-0.3182,1.0
122,"SOURCE: Eco, Umberto. “The Narrative Structure...",source eco umberto narrative structure fleming...,0.000000,0.000000,22,0.000,0.000,1.000,0.0000,0.0


In [26]:
df.to_csv('sentiment_feature_extraction.csv', index=False)

In [27]:
pd.read_csv('sentiment_feature_extraction.csv')

Unnamed: 0,original,Document,polarity,Subjectivity,Lenght,Negative Sentiment,Positive Sentiment,Neutral Sentiment,Compound Sentiment,Sentiment p/n
0,"In the following excerpt, originally published...",following excerpt originally published italian...,0.215000,0.320000,23,0.000,0.222,0.778,0.7184,1.0
1,by Umberto Eco,umberto eco,0.000000,0.000000,2,0.000,0.000,1.000,0.0000,0.0
2,In 1953 Ian Fleming published the first novel ...,1953 ian fleming published first novel 007 ser...,-0.103704,0.483333,30,0.237,0.131,0.631,-0.6369,-1.0
3,"To Spillane, Casino Royale owes, beyond doubt,...",spillane casino royale owes beyond doubt least...,0.101852,0.455556,51,0.311,0.335,0.354,0.0772,1.0
4,In the second place Bond is obsessed by an ima...,second place bond obsessed image japanese expe...,-0.040132,0.318933,99,0.246,0.107,0.648,-0.9485,-1.0
...,...,...,...,...,...,...,...,...,...,...
119,"On the other hand, for woman Fleming cannot ac...",hand woman fleming cannot accept decadent arch...,0.006250,0.400000,20,0.176,0.196,0.628,0.1321,1.0
120,"However, we are not here concerned with a psyc...",however concerned psychological interpretation...,0.187010,0.593382,82,0.117,0.255,0.628,0.9289,1.0
121,"And, again, a message does not really end exce...",message doe really end except concrete local r...,0.087500,0.141667,24,0.091,0.000,0.909,-0.3182,1.0
122,"SOURCE: Eco, Umberto. “The Narrative Structure...",source eco umberto narrative structure fleming...,0.000000,0.000000,22,0.000,0.000,1.000,0.0000,0.0
