In [9]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
import spacy

In [10]:
spacy.load('en')
lemmatizer = spacy.lang.en.English()
def my_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens])

In [11]:
path2files="/Users/ashishkumar/cs598dmcs/dataset/Hygiene/"
datFile = path2files+"hygiene.dat"
labelFile = path2files+"hygiene.dat.labels"
addDataFile = path2files + "hygiene.dat.additional"

df1 = pd.read_csv(addDataFile,header=None)
df1.columns = ['cuisine','zipCode','reviewCount','avgRating']
df2 = pd.read_csv(datFile, sep='\n',header=None)
#only first 546 restaurants are labeled
dataLabels = pd.read_csv(labelFile, sep='\n',header=None)
labeledData = 546
lbl = dataLabels[0:labeledData][0]

In [12]:
#df2.iloc[7][0]
trainigdf = df1[0:labeledData]
trainigdf['reviewText'] = df2[0:labeledData]
trainigdf['lbl'] = lbl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [13]:
train,test = train_test_split(trainigdf, test_size=0.12, random_state=42)

In [14]:
n_features = 2500
print("Extracting tf-idf features....")
tfidf_vectorizer = TfidfVectorizer(max_df=0.98, min_df=2,
                                   ngram_range=(1, 1),
                                   max_features=n_features,
                                   stop_words='english',
                                   sublinear_tf=False,
                                   tokenizer=my_tokenizer)
X_train_tfidf = tfidf_vectorizer.fit_transform(train['reviewText'])
X_test_tfidf = tfidf_vectorizer.fit_transform(test['reviewText'])
print("convert train sparse matrix")
sdfTrain = pd.SparseDataFrame(X_train_tfidf)
sdfTrain.fillna(0,inplace=True)

print("convert test sparse matrix")
sdfTest = pd.SparseDataFrame(X_test_tfidf)
sdfTest.fillna(0,inplace=True)
print("Done")

Extracting tf-idf features....
convert train sparse matrix
convert test sparse matrix
Done


In [15]:
len(sdfTrain.iloc[0]),len(sdfTest.iloc[0])

(2500, 2500)

In [16]:
clf = MultinomialNB(alpha=.022)
#clf2 = MLPClassifier(activation='relu',solver='sgd', alpha=1e-4, hidden_layer_sizes=(25,5), random_state=1)
clf.fit(X_train_tfidf,train['lbl'])
pred = clf.predict(X_test_tfidf)

print(metrics.f1_score(test['lbl'], pred, average='macro'))
metrics.confusion_matrix(test['lbl'], pred)

0.410714285714


array([[19,  9],
       [29,  9]])

In [17]:
clf2 = MLPClassifier(activation='logistic',solver='adam', alpha=3*1e-4, hidden_layer_sizes=(25,15), random_state=1,max_iter=5000)
clf2.fit(X_train_tfidf,train['lbl'])
pred = clf2.predict(X_test_tfidf)
print(metrics.f1_score(test['lbl'], pred, average='macro'))
metrics.confusion_matrix(test['lbl'], pred)


0.480555555556


array([[19,  9],
       [25, 13]])

In [18]:
clf3=DecisionTreeClassifier(random_state=19)
clf3.fit(X_train_tfidf,train['lbl'])
pred = clf3.predict(X_test_tfidf)
print(metrics.f1_score(test['lbl'], pred, average='macro'))
print(metrics.confusion_matrix(test['lbl'], pred))

0.482949308756
[[14 14]
 [20 18]]


In [19]:
trainigdf['cuisine']
enc = OneHotEncoder()
trainigdf.iloc[0]['cuisine']

"['Vietnamese', 'Sandwiches', 'Restaurants']"

In [20]:
updateddf = pd.get_dummies(data=trainigdf, columns = ['cuisine','zipCode'] )
len(updateddf.iloc[0])

176

In [28]:
n_features = 5000
print("Extracting tf-idf features....")
tfidf_vectorizer = TfidfVectorizer(max_df=0.98, min_df=2,
                                   ngram_range=(1, 1),
                                   max_features=n_features,
                                   stop_words='english',
                                   sublinear_tf=False,
                                   tokenizer=my_tokenizer)
review_tfidf = tfidf_vectorizer.fit_transform(updateddf['reviewText'])
sparsedf = pd.SparseDataFrame(review_tfidf)
sparsedf.fillna(0,inplace=True)
print("Done")

Extracting tf-idf features....
Done


In [29]:
result=pd.concat([updateddf,sparsedf], axis=1)
result = result.drop(['reviewText'], axis=1)


In [30]:
train,test = train_test_split(result, test_size=0.12, random_state=42)


In [31]:
train_y = np.array(train['lbl'])
test_y = np.array(test['lbl'])

In [32]:

clf4 = MultinomialNB(alpha=.022)
#clf2 = MLPClassifier(activation='relu',solver='sgd', alpha=1e-4, hidden_layer_sizes=(25,5), random_state=1)
clf4.fit(train.drop(['lbl'], axis=1),train_y)
pred = clf4.predict(test.drop(['lbl'], axis=1))

print(metrics.f1_score(test_y, pred, average='macro'))
metrics.confusion_matrix(test_y, pred)

0.696691176471


array([[24,  4],
       [16, 22]])

In [33]:
clf5 = MLPClassifier(activation='logistic',solver='adam', alpha=3*1e-4, hidden_layer_sizes=(25,15), random_state=1,max_iter=5000)
clf5.fit(train.drop(['lbl'], axis=1),train_y)
pred = clf5.predict(test.drop(['lbl'], axis=1))
print(metrics.f1_score(test_y, pred, average='macro'))
metrics.confusion_matrix(test_y, pred)


0.681745120551


array([[22,  6],
       [15, 23]])

In [34]:
clf6=DecisionTreeClassifier(random_state=19)
clf6.fit(train.drop(['lbl'], axis=1),train_y)
pred = clf6.predict(test.drop(['lbl'], axis=1))
print(metrics.f1_score(test_y, pred, average='macro'))
print(metrics.confusion_matrix(test_y, pred))


0.58854767952
[[22  6]
 [21 17]]
