In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import hazm
import os

In [2]:
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')

In [3]:
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), sep='\t')

In [4]:
hotel_comment = hotel_pol['comment'].tolist()

In [5]:
vectorizer = TfidfVectorizer(ngram_range=(1,3),
    tokenizer=nk.Preprocessor(stem=False).tokenize,
    preprocessor=nk.Cleaner().clean,
    max_features=50000)
train_data_features = vectorizer.fit_transform(hotel_comment)
train_data_features.shape

(4000, 599287)

In [6]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_new = SelectKBest(chi2, k=15000).fit_transform(train_data_features, hotel_pol["c"].tolist())

In [7]:
X_new.shape

(4000, 15000)

In [8]:
# Show 10 most frequent words in dataset
# Word frequence
dist = X_new.sum(axis=0)
# Convert matrix to array
dist = np.squeeze(np.asarray(dist))
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()

sorted(zip(dist, vocab), reverse=True)[:10]

[(182.61772459694166, 'اب سرد نداشت'),
 (124.56303716302727, 'آژانس استفاده نکنید'),
 (122.61893600537513, 'آب هنگام استفاده'),
 (116.38104303573249, 'آب داخل ظرفها'),
 (97.472264390277857, 'اتاق دقت کنید'),
 (88.50502847098916, 'be ma'),
 (63.942342083288985, 'آینده به'),
 (55.917624273951397, 'آرائه خدمات علی'),
 (53.724194178365281, 'آرامش را برای'),
 (53.590067710987846, 'آنا من')]

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation as cv
X_train, X_test, y_train, y_test = cv.train_test_split(
    X_new,
    hotel_pol["c"].tolist(), 
    test_size=0.3, 
    random_state=0)

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

# Initialize a Random Forest classifier with 100 trees
clf = svm.LinearSVC()

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run



cv.cross_val_score( clf, X_new, hotel_pol["c"].tolist(), cv=3).mean()


0.90799982891437159

In [11]:
clf = clf.fit( X_train, y_train )
clf.score(X_test, y_test)   

0.89666666666666661

In [12]:
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
target_names = ['pos', 'neg']
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

        pos       0.88      0.92      0.90       599
        neg       0.91      0.88      0.89       601

avg / total       0.90      0.90      0.90      1200

