### IMDB dataset Words

This notebook will help to explore the data set `imdb`: in particular it will try to sort the words in reviews according to their importance in the two opinion-groups.

In [1]:
%load_ext autoreload
%autoreload 2


from sentimental_hwglu.naive_sa import NaiveSA
from sentimental_hwglu.utils import *
from sentimental_hwglu.words_statistics import WordStatistics
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import time
import re


In [2]:
data_directory = None

#### Load Data

In [3]:
if data_directory is None:
    print("Project directory: ")
    data_directory = input()

Project directory: 


In [4]:
project = Project(data_directory)
df = loadIMDBdataset(filename=project.csv_filename_extened)
n_reviews = len(df.reviews)

In [5]:
ws = WordStatistics(df, 'stamm')
ws.createWordsSets()

 running set_words  1 / 2
 tokenization for review  [################### ] 96.0%
 tokenization took  1.777862310409546  sec.
 running set_words  2 / 2
 tokenization for review  [################### ] 96.0%
 tokenization took  1.6631364822387695  sec.
 create common words
 function executed in 0.2087s
 create only negative words
 function executed in 0.0393s
 create only positive words
 function executed in 0.0411s
 function executed in 3.7632s


In [9]:

dir = project.basedir + '/datasets/words/'
df_w_common = pd.read_csv(dir + '/frequencies_words_common.csv', sep=';', index_col=False)
df_w_pos = pd.read_csv(dir + '/frequencies_words_only_pos.csv', sep=';', index_col=False)
df_w_neg = pd.read_csv(dir + '/frequencies_words_only_neg.csv', sep=';', index_col=False)
df_w_common_stamm = pd.read_csv(dir + '/frequencies_words_stamm_common.csv', sep=';', index_col=False)
df_w_pos_stamm = pd.read_csv(dir + '/frequencies_words_stamm_only_pos.csv', sep=';', index_col=False)
df_w_neg_stamm = pd.read_csv(dir + '/frequencies_words_stamm_only_neg.csv', sep=';', index_col=False)

df = pd.DataFrame()
df['nur pos. word'] = df_w_pos_stamm.sort_values('count_use', ascending=False, ignore_index=True).head(n=5).word
df['count'] = df_w_pos_stamm.sort_values('count_use', ascending=False, ignore_index=True).head(n=5).count_use
df['nur neg. word'] = df_w_neg_stamm.sort_values('count_use', ascending=False, ignore_index=True).head(n=5).word
df['count '] = df_w_neg_stamm.sort_values('count_use', ascending=False, ignore_index=True).head(n=5).count_use

In [10]:
print("********************************")
print("common:    ", len(df_w_common))
print("positive:  ", len(df_w_pos))
print("negative:  ", len(df_w_neg))
print("     + ------------------------")
print("          ", len(df_w_pos) + len(df_w_neg) + len(df_w_common))
print()
print("********************************")
print("common:    ", len(df_w_common_stamm))
print("positive:  ", len(df_w_pos_stamm))
print("negative:  ", len(df_w_neg_stamm))
print("     + ------------------------")
print("          ", len(df_w_pos_stamm) + len(df_w_neg_stamm) + len(df_w_common_stamm))

********************************
common:     54273
positive:   51309
negative:   49249
     + ------------------------
           154831

********************************
common:     38317
positive:   42959
negative:   41486
     + ------------------------
           122762


In [10]:
df_tot = pd.DataFrame()
df_tot_stamm = pd.DataFrame()

In [11]:
df_tot = pd.concat([df_w_common, df_w_neg, df_w_pos])
df_tot_stamm = pd.concat([df_w_common_stamm, df_w_neg_stamm, df_w_pos_stamm])

In [12]:
print("words:        ", len(df_tot.word.unique()))
print("words(stamm): ", len(df_tot_stamm.word.unique()))

words:         154830
words(stamm):  122761


In [13]:
df_w_common_stamm.head()

Unnamed: 0,word,count_use,count_positive,count_negative,fraction_positive,fraction_negative
0,estim,62,30,32,0.483871,0.516129
1,urban,388,275,113,0.708763,0.291237
2,hawk',30,27,3,0.9,0.1
3,saiyan,5,1,4,0.2,0.8
4,knit,46,20,26,0.434783,0.565217


In [11]:
print(df_w_common_stamm.sort_values('count_use', ascending=False).head(n=1000).sort_values('fraction_positive', ascending=False).head(10).to_latex(index=False, columns=['word', 'count_use', 'count_positive', 'fraction_positive']))
print(df_w_common_stamm.sort_values('count_use', ascending=False).head(n=1000).sort_values('fraction_negative', ascending=False).head(10).to_latex(index=False, columns=['word', 'count_use', 'count_negative', 'fraction_negative']))
print(df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
     word &  count\_use &  count\_positive &  fraction\_positive \\
\midrule
   superb &       1291 &            1108 &           0.858249 \\
  delight &       1057 &             872 &           0.824976 \\
    excel &       4424 &            3624 &           0.819168 \\
  fantast &       1623 &            1302 &           0.802218 \\
  perfect &       3440 &            2686 &           0.780814 \\
brilliant &       2395 &            1864 &           0.778288 \\
  favorit &       2763 &            2145 &           0.776330 \\
perfectli &       1283 &             984 &           0.766952 \\
    uniqu &       1330 &            1008 &           0.757895 \\
     amaz &       3000 &            2261 &           0.753667 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrr}
\toprule
   word &  count\_use &  count\_negative &  fraction\_negative \\
\midrule
  worst &       5302 &            4858 &           0.916258 \\
   wast &       4360 &            3986 &      

In [21]:
df_w_common_stamm.query("word == 'superb'")['fraction_positive'].values[0]

0.8582494190549961

In [28]:
df_w_common_stamm.sort_values('count_use', ascending=False).head(n=1000).sort_values('fraction_positive', ascending=False).query("fraction_positive > 0.7")['word'].values

array(['superb', 'delight', 'excel', 'fantast', 'perfect', 'brilliant',
       'favorit', 'perfectli', 'uniqu', 'amaz', 'highli', 'today',
       'greatest', 'beauti', 'complex', 'touch', 'sweet', 'oscar',
       'intens', 'masterpiec', 'strong', 'great', 'season', 'fascin'],
      dtype=object)

In [None]:
print(df_w_neg_stamm.sort_values('count_use', ascending=False).head(n=5).to_latex())

\begin{tabular}{llrrrrr}
\toprule
{} &       word &  count\_use &  count\_positive &  count\_negative &   fraction\_positive &   fraction\_negative \\
\midrule
33204 &  carnosaur &         67 &               0 &              67 &                 0.0 &                 1.0 \\
35930 &     komodo &         63 &               0 &              63 &                 0.0 &                 1.0 \\
1455  &    piranha &         59 &               0 &              59 &                 0.0 &                 1.0 \\
3617  &     tashan &         44 &               0 &              44 &                 0.0 &                 1.0 \\
28135 &  cornfield &         42 &               0 &              42 &                 0.0 &                 1.0 \\
\bottomrule
\end{tabular}

