In [1]:
#!/usr/bin/python3
from sklearn.svm import OneClassSVM
import pandas as pd
import feather
from sklearn.model_selection import train_test_split

import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
np.random.seed(500)

In [2]:
file_path = '/home/parallels/KTH/II2202/Data/YelpNYC/database_feather.feather'
#data_frame = feather.read_dataframe(file_path)
df = pd.read_feather(file_path, columns=None, use_threads=True)

In [3]:
df.dtypes

user_id             int32
prod_id             int32
date               object
rating            float64
label               int32
review_content     object
word_count          int32
dtype: object

In [4]:
print(df)

        user_id  prod_id        date  rating  label  \
0           923        0  2014-12-08     3.0     -1   
1           923       19  2014-01-14     5.0     -1   
2           923       63  2014-11-13     4.0     -1   
3           923      100  2014-03-05     4.0     -1   
4           923      103  2014-07-28     5.0     -1   
...         ...      ...         ...     ...    ...   
181350   161143      349  2014-02-19     5.0      1   
181351   161144      349  2014-02-11     5.0      1   
181352   161145      349  2014-02-09     5.0      1   
181353   161146      349  2014-02-06     5.0      1   
181354   161147      349  2014-01-30     5.0      1   

                                           review_content  word_count  
0       The food at snack is a selection of popular Gr...          42  
1       The restaurant is on the ground floor of a typ...          95  
2       I really enjoyed brunch at Jane. The ambiance ...          35  
3       Clinton street baking is a super popular ba

In [5]:
print(df.describe())

             user_id        prod_id         rating          label  \
count  181355.000000  181355.000000  181355.000000  181355.000000   
mean    53637.327380     454.388867       4.025205       0.788547   
std     45732.857076     260.687294       1.057903       0.614976   
min       923.000000       0.000000       1.000000      -1.000000   
25%     13917.000000     245.000000       4.000000       1.000000   
50%     39608.000000     465.000000       4.000000       1.000000   
75%     86406.000000     668.000000       5.000000       1.000000   
max    161147.000000     922.000000       5.000000       1.000000   

          word_count  
count  181355.000000  
mean      240.641273  
std      2992.140353  
min         1.000000  
25%        46.000000  
50%        89.000000  
75%       158.000000  
max    212038.000000  


In [6]:
df.iloc[390,:]

user_id                                                         994
prod_id                                                         251
date                                                     2012-06-01
rating                                                            4
label                                                             1
review_content    Since I am a pescatarian and pregnant (pregnan...
word_count                                                      179
Name: 390, dtype: object

In [7]:
genuine = df.loc[df['label'] == 1]
genuine = genuine[['review_content','word_count']]

In [8]:
array = genuine.values

X = array[:,0]
Y = array[:,1]
Y=Y.astype(str)
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.9999, random_state=1)

In [9]:
len(X_train)

16

In [10]:
# Step - a : Remove blank rows if any.
genuine['review_content'].dropna(inplace=True)

In [11]:
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
genuine['review_content'] = [entry.lower() for entry in genuine['review_content']]

In [12]:
word_tokenize(genuine.iloc[0,0])

['tiny',
 'little',
 'place',
 ',',
 'but',
 'very',
 'good',
 'food',
 '.',
 'pastitsio',
 'was',
 'especially',
 'good',
 '.']

In [13]:
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
# tokenized = []
# for index,entry in enumerate(genuine['review_content']): 
#     genuine.loc[index,'tokenized'] = [word_tokenize(entry)]
#     percent = index/len(genuine)*100
#     print('percent done [%d%%]\r'%percent, end="")
    
    
genuine['tokenized']= [word_tokenize(entry) for entry in genuine['review_content']]

In [14]:
genuine

Unnamed: 0,review_content,word_count,tokenized
47,"tiny little place, but very good food. pastits...",11,"[tiny, little, place, ,, but, very, good, food..."
48,i had the braised lamb sandwich and was one of...,29,"[i, had, the, braised, lamb, sandwich, and, wa..."
49,this spot is close to my job so i decided to c...,181,"[this, spot, is, close, to, my, job, so, i, de..."
50,"for lunch, my friend and i had: -lamb sandwich...",82,"[for, lunch, ,, my, friend, and, i, had, :, -l..."
51,came here for a friend's birthday. the food wa...,156,"[came, here, for, a, friend, 's, birthday, ., ..."
...,...,...,...
181350,get the fried brussel sprouts. get the emily p...,55,"[get, the, fried, brussel, sprouts, ., get, th..."
181351,the food was perfect. the wine was perfect. th...,78,"[the, food, was, perfect, ., the, wine, was, p..."
181352,came here for sunday brunch. everything we tas...,45,"[came, here, for, sunday, brunch, ., everythin..."
181353,"i'm very spoiled with pizza. really, i have tr...",280,"[i, 'm, very, spoiled, with, pizza, ., really,..."


In [15]:
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
final_text = [None]*len(genuine['tokenized'])
for index,entry in enumerate(genuine['tokenized']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    percent = float(index/len(genuine['tokenized'])*100)
    #print('index, entry [%f%%][%s%%]\r'%index %entry, end="")
    print('percent done [%f%%]\r'%percent, end="")
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    genuine.loc[index,'text_final'] = str(Final_words)
    final_text[index]=(Final_words)
    
    #print('index ' + str(index) + ' entry ' + str(entry) +  '...final//// '+ str(genuine.loc[index,'text_final']) + '\r')

percent done [91.391669%]

In [16]:
genuine

Unnamed: 0,review_content,word_count,tokenized,text_final
47,"tiny little place, but very good food. pastits...",11.0,"[tiny, little, place, ,, but, very, good, food...","['cute', 'bar', 'absolutely', 'adore', 'decor'..."
48,i had the braised lamb sandwich and was one of...,29.0,"[i, had, the, braised, lamb, sandwich, and, wa...","['sometimes', 'take', 'get', 'bandwagon', 'sou..."
49,this spot is close to my job so i decided to c...,181.0,"[this, spot, is, close, to, my, job, so, i, de...","['make', 'mean', 'cocktail', 'north', 'end', '..."
50,"for lunch, my friend and i had: -lamb sandwich...",82.0,"[for, lunch, ,, my, friend, and, i, had, :, -l...","['mark', 'challenge', 'continue', 'embark', 'u..."
51,came here for a friend's birthday. the food wa...,156.0,"[came, here, for, a, friend, 's, birthday, ., ...","['like', 'make', 'since', 'appetizer', 'near',..."
...,...,...,...,...
162059,,,,"['yummy', 'lunch', 'special', 'inexpensive', '..."
162060,,,,"['food', 'pretty', 'good', 'expensive', 'porti..."
162061,,,,"['place', 'review', 'base', 'last', 'visit', '..."
162062,,,,"['place', 'great', 'service', 'trendy', 'ambia..."


In [17]:
genuine.loc[0]

review_content                                                  NaN
word_count                                                      NaN
tokenized                                                       NaN
text_final        ['tiny', 'little', 'place', 'good', 'food', 'p...
Name: 0, dtype: object

In [36]:
len(final_text)

162181

In [37]:
final_text_trial = []
for idx, text in enumerate(final_text):
    final_text_trial.append(str(text))

In [38]:
final_text_trial

["['tiny', 'little', 'place', 'good', 'food', 'pastitsio', 'especially', 'good']",
 "['braise', 'lamb', 'sandwich', 'one', 'best', 'sandwich', 'life', 'favour', 'try', 'place', 'friendly', 'service', 'cosy', 'atmosphere']",
 "['spot', 'close', 'job', 'decide', 'check', 'lunch', 'price', 'would', 'expect', 'village', 'lunch', 'price', 'meal', 'come', 'mint', 'lemonade', 'salad', 'order', 'think', 'may', 'special', 'roast', 'chicken', 'vegetable', 'salad', 'chicken', 'unbelievably', 'dry', 'point', 'tough', 'time', 'chew', 'without', 'choke', 'bit', 'serve', 'bed', 'arugula', 'could', 'chop', 'little', 'finer', 'full', 'hard', 'eat', 'piece', 'eat', 'salad', 'expect', 'cut', 'make', 'easy', 'bite', 'necessarily', 'chopped', 'salad', 'piece', 'actually', 'fit', 'mouth', 'without', 'chop', 'basically', 'want', 'use', 'knife', 'eat', 'salad', 'however', 'service', 'friendly', 'quick', 'restaurant', 'comfortable', 'albeit', 'tiny', 'would', 'go', 'back', 'try', 'something', 'else', 'probably

In [27]:
genuine['text_final']

47        ['cute', 'bar', 'absolutely', 'adore', 'decor'...
48        ['sometimes', 'take', 'get', 'bandwagon', 'sou...
49        ['make', 'mean', 'cocktail', 'north', 'end', '...
50        ['mark', 'challenge', 'continue', 'embark', 'u...
51        ['like', 'make', 'since', 'appetizer', 'near',...
                                ...                        
162059    ['yummy', 'lunch', 'special', 'inexpensive', '...
162060    ['food', 'pretty', 'good', 'expensive', 'porti...
162061    ['place', 'review', 'base', 'last', 'visit', '...
162062    ['place', 'great', 'service', 'trendy', 'ambia...
162063    ['food', 'good', 'excellent', 'worth', 'revisi...
Name: text_final, Length: 177456, dtype: object

In [42]:
Train_X, Test_X = model_selection.train_test_split(final_text_trial,test_size=0.3)

In [43]:
Tfidf_vect = TfidfVectorizer(max_features=50000,lowercase = False)        
Tfidf_vect.fit(final_text_trial)

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [41]:
print(Tfidf_vect.vocabulary_)



In [None]:
# fit the training dataset on the classifier
clf = OneClassSVM(gamma='auto',verbose=True).fit(Train_X_Tfidf)
# predict the labels on validation dataset
predictions_oneclass = clf.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
#print("Accuracy Score -> ",accuracy_score(predictions_oneclass, Test_X_Tfidf)*100)



[LibSVM]