In [1]:
import math
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

%matplotlib inline
import matplotlib.pyplot as plt

# 1. frequency of parts of speech
- syntax analysis: 어떤 품사들이 들어있는지
- matplotlib: 그래프로 품사들의 분포 그리기

In [None]:
# Sentiment categories available in the movie_reviews corpus.
categories=movie_reviews.categories()
# Bare expression so the notebook renders the list.
categories

In [None]:
# Tokenize the raw text of all 'neg' and 'pos' reviews, then POS-tag the tokens.
text=movie_reviews.raw(categories=['neg', 'pos'])
# NOTE: despite its name, `tags` holds the word tokens, not the POS tags.
tags=nltk.word_tokenize(text)
# Output of nltk.pos_tag; the next cell reads element [1] of each item as the tag.
tag_tuple=nltk.pos_tag(tags)

In [None]:
# Keep only the second element (the tag) of every tagged item.
tags_list = [pair[1] for pair in tag_tuple]

In [None]:
# Load the upenn tagset table shipped with NLTK and collect its tag names.
from nltk.data import load

tagdict = load('help/tagsets/upenn_tagset.pickle')
key = [tag_name for tag_name in tagdict]

In [None]:
# Frequency of every tagset entry among the tagged tokens.
# Count all tags in a single pass; calling tags_list.count(tag) once per tag
# rescanned the whole token list for each entry in `key`.
tag_freq = nltk.FreqDist(tags_list)

# Tagset entries that never occur get an explicit 0 (FreqDist returns 0 for
# missing keys), so the result has exactly one entry per tag in `key`.
tag_dict = {tag: tag_freq[tag] for tag in key}

tag_dict

In [None]:
# Bar plot of the POS-tag frequencies, one bar per tag in tag_dict.
fig, ax = plt.subplots(figsize=(20, 10))
dictionary = fig  # original (misleading) name of the figure handle, kept for compatibility

positions = list(range(len(tag_dict)))
# Materialize the dict views so matplotlib receives plain sequences.
ax.bar(positions, list(tag_dict.values()), align='center')
ax.set_xticks(positions)
ax.set_xticklabels(list(tag_dict.keys()))
ax.set_xlabel('part-of-speech tag')
ax.set_ylabel('frequency')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_title('frequency of parts of speech');

# 2. TF-IDF
- count_word: the number of a target word in a document
- count: the number of all words in a document
- tf=math.log(1+(count_word/count))
- idf=math.log(N/num)
- N: the total number of documents in the group (1000 here)
- num: the number of documents containing the target word
- TF-IDF=idf*tf

- TF-IDF 계산을 통해 positive, negative 그룹의 상위 15개 words 찾기

In [2]:
# File ids of the reviews, one list per sentiment label.
pos_doc, neg_doc = (
    movie_reviews.fileids(categories=label) for label in ('pos', 'neg')
)

In [None]:
# Display the file ids of the positive reviews.
pos_doc

In [3]:
# Tokenize each review file, keeping only tokens that start with a letter.
# Compiled once up front; the original recompiled the pattern on every iteration.
reg = re.compile(r"[a-zA-Z]+", re.I)


def _alpha_tokens_by_file(fileids):
    """Map each file id to the corpus tokens of that file accepted by ``reg.match``.

    ``reg.match`` anchors only at the start of a token, so a token is kept when
    its first character is a letter; the rest of the token is not checked.
    """
    return {
        fileid: [word for word in movie_reviews.words(fileids=fileid) if reg.match(word)]
        for fileid in fileids
    }


pos_dict = _alpha_tokens_by_file(pos_doc)
neg_dict = _alpha_tokens_by_file(neg_doc)

In [None]:
# Inspect the filtered tokens of one positive review.
pos_dict['pos/cv258_5792.txt']

In [4]:
# Remove English stopwords from every review's token list.
english_stops = stopwords.words('english')
# Set for constant-time membership tests; looking tokens up in the list
# rescanned all stopwords for every single token.
stop_set = set(english_stops)

# One filtered token list per file id; pos_dict / neg_dict are keyed by the
# same file ids as pos_doc / neg_doc, so no preliminary copy is needed.
pos_words = {
    doc_id: [word for word in pos_dict[doc_id] if word not in stop_set]
    for doc_id in pos_doc
}
neg_words = {
    doc_id: [word for word in neg_dict[doc_id] if word not in stop_set]
    for doc_id in neg_doc
}

In [5]:
# Lemmatize every remaining token (lemmatize is called without an explicit
# part of speech).
lemmatizer = WordNetLemmatizer()

lemmatized_pos = pos_words.copy()
for doc_id in pos_doc:
    lemmatized_pos[doc_id] = [lemmatizer.lemmatize(token) for token in pos_words[doc_id]]

lemmatized_neg = neg_words.copy()
for doc_id in neg_doc:
    lemmatized_neg[doc_id] = [lemmatizer.lemmatize(token) for token in neg_words[doc_id]]

In [6]:
def _doc_word_pairs(doc_ids, words_by_doc):
    """Yield a (document id, word) pair for every word of every listed document."""
    for doc_id in doc_ids:
        for word in words_by_doc[doc_id]:
            yield doc_id, word


# Per-document word counts, positive group
cfd = nltk.ConditionalFreqDist(_doc_word_pairs(pos_doc, lemmatized_pos))

# Per-document word counts, negative group
cfd1 = nltk.ConditionalFreqDist(_doc_word_pairs(neg_doc, lemmatized_neg))

In [7]:
# Count tables built from the conditional frequency distributions; missing
# (NaN) entries are filled with 0.
pos_df, neg_df = (pd.DataFrame(dist).fillna(0) for dist in (cfd, cfd1))

In [None]:
# Display the positive-group count table.
pos_df

# pos_df -> pos_tf

In [8]:
# Work on a copy so pos_df itself is left untouched by the TF computation.
pos_temp=pos_df.copy()

In [9]:
# Term frequency per document: tf = log(1 + count_word / count).
# Column sums = total number of counted words in each document.
sum_for_each_text = pos_temp.sum(axis=0)
pos_tf = (pos_temp / sum_for_each_text) + 1
# Vectorized log over the whole frame; replaces the element-by-element
# applymap(math.log) call (DataFrame.applymap is deprecated in recent pandas).
pos_tf_df = np.log(pos_tf)

In [10]:
# Spot check: largest TF value within one positive review.
pos_tf_df['pos/cv005_29443.txt'].max()

0.025769513179051611

# neg_df -> neg_tf

In [11]:
# Work on a copy so neg_df itself is left untouched by the TF computation.
neg_temp = neg_df.copy()

In [12]:
# Term frequency per document: tf = log(1 + count_word / count).
# Column sums = total number of counted words in each document.
sum_for_each_text = neg_temp.sum(axis=0)
neg_tf = (neg_temp / sum_for_each_text) + 1
# Vectorized log over the whole frame; replaces the element-by-element
# applymap(math.log) call (DataFrame.applymap is deprecated in recent pandas).
neg_tf_df = np.log(neg_tf)

In [13]:
# Spot check: largest TF value within one negative review.
neg_tf_df["neg/cv000_29416.txt"].max()

0.024391453124159267

In [14]:
# Display the negative-group TF table.
neg_tf_df

Unnamed: 0,neg/cv000_29416.txt,neg/cv001_19502.txt,neg/cv002_17424.txt,neg/cv003_12683.txt,neg/cv004_12641.txt,neg/cv005_29357.txt,neg/cv006_17022.txt,neg/cv007_4992.txt,neg/cv008_29326.txt,neg/cv009_29417.txt,...,neg/cv990_12443.txt,neg/cv991_19973.txt,neg/cv992_12806.txt,neg/cv993_29565.txt,neg/cv994_13229.txt,neg/cv995_23113.txt,neg/cv996_12447.txt,neg/cv997_5152.txt,neg/cv998_15691.txt,neg/cv999_14636.txt
aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaaaaaaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaaaaaahhhh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaaaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaliyah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aalyah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aatish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Now let's get into idf

In [15]:
# Inverse document frequency: idf = log(N / df).
# N comes from the table itself instead of the hard-coded 1000.
# NOTE(review): assumes every document of the group is a column of pos_df.
n_pos_docs = pos_df.shape[1]
# Number of documents (columns) in which each word has a non-zero count.
pos_doc_freq = (pos_df != 0).sum(axis=1)
# Kept as a one-column DataFrame (column label 0) because a later cell reads pos_idf[0].
pos_idf = pd.DataFrame(np.log(n_pos_docs / pos_doc_freq))
pos_idf

Unnamed: 0,0
aaaahhhs,6.907755
aahs,6.907755
aamir,6.907755
aardman,6.214608
aaron,4.710531
abandon,3.863233
abandoned,3.649659
abandoning,6.907755
abandonment,6.214608
abba,6.907755


In [16]:
# Inverse document frequency: idf = log(N / df).
# N comes from the table itself instead of the hard-coded 1000.
# NOTE(review): assumes every document of the group is a column of neg_df.
n_neg_docs = neg_df.shape[1]
# Number of documents (columns) in which each word has a non-zero count.
neg_doc_freq = (neg_df != 0).sum(axis=1)
# Kept as a one-column DataFrame (column label 0) because a later cell reads neg_idf[0].
neg_idf = pd.DataFrame(np.log(n_neg_docs / neg_doc_freq))
neg_idf

Unnamed: 0,0
aa,6.214608
aaa,6.214608
aaaaaaaaah,6.907755
aaaaaaaahhhh,6.907755
aaaaaah,6.907755
aaliyah,5.809143
aalyah,6.907755
aaron,5.115996
aatish,6.907755
ab,5.809143


# tf-idf calculation

# pos_tf_idf

In [17]:
# Display the positive-group TF table.
pos_tf_df

Unnamed: 0,pos/cv000_29590.txt,pos/cv001_18431.txt,pos/cv002_15918.txt,pos/cv003_11664.txt,pos/cv004_11636.txt,pos/cv005_29443.txt,pos/cv006_15448.txt,pos/cv007_4968.txt,pos/cv008_29435.txt,pos/cv009_29592.txt,...,pos/cv990_11591.txt,pos/cv991_18645.txt,pos/cv992_11962.txt,pos/cv993_29737.txt,pos/cv994_12270.txt,pos/cv995_21821.txt,pos/cv996_11592.txt,pos/cv997_5046.txt,pos/cv998_14111.txt,pos/cv999_13106.txt
aaaahhhs,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
aahs,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
aamir,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
aardman,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
aaron,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02353,0.000000,0.0,0.0
abandon,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
abandoned,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
abandoning,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
abandonment,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0
abba,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0


In [19]:
# Pull the single column (label 0) out of pos_idf so it can be multiplied row-wise below.
pos_idf_series = pos_idf[0]

In [20]:
# TF-IDF: multiply every row of the TF table by that row's IDF value (aligned on the row index).
pos_tf_idf = pos_tf_df.multiply(pos_idf_series, axis="index")

In [22]:
# Spot check: largest TF-IDF value within one positive review.
pos_tf_idf["pos/cv000_29590.txt"].max()

0.047259604924639737

In [29]:
# Pull the single column (label 0) out of neg_idf so it can be multiplied row-wise below.
neg_idf_series = neg_idf[0]

In [32]:
# TF-IDF: multiply every row of the TF table by that row's IDF value (aligned on the row index).
neg_tf_idf = neg_tf_df.multiply(neg_idf_series, axis="index")

# Positive word ranking

In [26]:
# Average each word's TF-IDF across all positive documents, then show the 15 highest.
pos_result_df = pos_tf_idf.mean(axis="columns")
pos_result_df.sort_values(ascending=False).head(15)

alien     0.002227
life      0.002111
movie     0.002101
love      0.001996
family    0.001991
comedy    0.001967
war       0.001952
story     0.001946
action    0.001939
star      0.001847
great     0.001831
scene     0.001819
good      0.001792
man       0.001761
u         0.001758
dtype: float64

# Negative word ranking

In [33]:
# Average each word's TF-IDF across all negative documents, then show the 15 highest.
neg_result_df = neg_tf_idf.mean(axis="columns")
neg_result_df.sort_values(ascending=False).head(15)

vampire            0.004956
murphy             0.004514
harry              0.004394
extraordinarily    0.004296
horrendous         0.004139
godzilla           0.004138
batman             0.003983
mar                0.003926
ape                0.003821
carrey             0.003769
spawn              0.003754
alien              0.003751
carpenter          0.003711
joe                0.003707
pokemon            0.003634
dtype: float64