In [2]:
# Import the libraries used below, then use pandas' "read_json" function to
# read the labeled training data.
# WARNING: Training the model takes about 10-25 minutes, because the vocabulary size is set to 10,000 and the number of trees to 100.
import pandas as pd  
import re     
import nltk
import json
import numpy
from nltk.corpus import stopwords

# Load the training set from train.json, decoding the file as UTF-8.
train = pd.read_json("train.json", encoding="utf-8")


# Lazily-built cache of the NLTK Russian stop-word list, so the corpus file is
# read once instead of on every call to text_to_words().
_RUSSIAN_STOPS = None


def _russian_stopwords():
    # Return the NLTK Russian stop words as a frozenset, loading them on first use.
    global _RUSSIAN_STOPS
    if _RUSSIAN_STOPS is None:
        _RUSSIAN_STOPS = frozenset(stopwords.words("russian"))
    return _RUSSIAN_STOPS


def text_to_words( raw_text, stops=None ):
    """Convert a raw text into a space-separated string of cleaned words.

    Steps:
      1. Replace every character that is not a Russian Cyrillic letter
         with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop the words found in ``stops``.
      4. Join the remaining words with single spaces.

    Parameters
    ----------
    raw_text : str
        The text to clean.
    stops : collection of str, optional
        Words to remove. When omitted, the NLTK Russian stop-word list is
        used (loaded once and cached).

    Returns
    -------
    str
        The cleaned text; an empty string if no words remain.
    """
    if stops is None:
        stops = _russian_stopwords()

    # "ё"/"Ё" sit outside the contiguous а-я / А-Я code-point ranges, so they
    # must be listed explicitly; otherwise words such as "ещё" are cut apart.
    # NOTE(review): "ё" is kept as-is, not folded to "е" - confirm whether the
    # stop-word list needs that normalisation.
    letters_only = re.sub("[^а-яА-ЯёЁ]", " ", raw_text)

    words = letters_only.lower().split()

    return " ".join(w for w in words if w not in stops)

# Number of training texts (size of the "text" column); used in the
# progress messages below.
num_texts = train["text"].size

print("Cleaning and parsing the training set texts...\n")

# Clean every training text in row order. Iterating over the column itself
# (instead of looking rows up as train["text"][i]) is positional, so it does
# not depend on the DataFrame index being exactly 0..n-1.
clean_train_texts = []
for i, raw_text in enumerate( train["text"] ):
    # Print a progress message after every 1000th text
    if (i + 1) % 1000 == 0:
        print("Text %d of %d\n" % ( i+1, num_texts ))
    clean_train_texts.append( text_to_words( raw_text ))


print("Creating the bag of words...\n")

from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool. max_features caps the vocabulary at 10000 terms.
# (No backslash continuations are needed inside the parentheses.)
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 10000)

# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms the training data into feature vectors.
# Its input is the list of cleaned text strings.
train_data_features = vectorizer.fit_transform(clean_train_texts)

# Convert the sparse result to a dense numpy array.
# NOTE(review): the dense matrix has one column per vocabulary term and can
# be large; confirm it fits in memory for bigger corpora.
train_data_features = train_data_features.toarray()
print(train_data_features.shape)

# Take a look at the words in the vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `vocab` a plain list of str, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees.
# random_state pins the bootstrap/feature sampling so repeated runs give the
# same model; n_jobs=-1 builds the trees on all available cores.
forest = RandomForestClassifier(n_estimators = 100,
                                random_state = 42,
                                n_jobs = -1)

# Fit the forest to the training set, using the bag of words as
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["sentiment"] )
# Read the test data
test = pd.read_json('test.json',encoding='utf-8')

# Print the (rows, columns) shape of the test set as a sanity check
print(test.shape)

# Create an empty list and append the cleaned texts one by one
num_texts = len(test["text"])
clean_test_texts = [] 

print("Cleaning and parsing the test set movie reviews...\n")
# Iterate over the column positionally rather than via test["text"][i], so
# the loop does not depend on the DataFrame index being exactly 0..n-1.
for i, raw_text in enumerate(test["text"]):
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i+1, num_texts))
    clean_text = text_to_words( raw_text )
    clean_test_texts.append( clean_text )

# Get a bag of words for the test set using the vocabulary learned on the
# training set (transform only, no re-fitting), and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_texts)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)





Cleaning and parsing the training set texts...

Text 1000 of 8263

Text 2000 of 8263

Text 3000 of 8263

Text 4000 of 8263

Text 5000 of 8263

Text 6000 of 8263

Text 7000 of 8263

Text 8000 of 8263

Creating the bag of words...

(8263, 10000)
['абай', 'абая', 'абдибеков', 'абдирович', 'абзац', 'абишева', 'абишевич', 'абишевича', 'аблай', 'аблязов', 'абонентов', 'абонентских', 'абр', 'абсолютно', 'абсолютному', 'абсолютным', 'абу', 'аварии', 'аварийно', 'аварийных', 'авария', 'август', 'августа', 'августе', 'авиакомпании', 'авиакомпаний', 'авиакомпания', 'авиаперевозок', 'авиации', 'авиационного', 'авиационный', 'австрии', 'авт', 'авто', 'автобизнеса', 'автобус', 'автобуса', 'автобусах', 'автобусной', 'автобусов', 'автобусы', 'автоваз', 'автоваза', 'автодорог', 'автодороги', 'автозаводов', 'автокомпонентов', 'автокредитов', 'автокредитования', 'автоматизации', 'автоматизация', 'автоматически', 'автомашин', 'автомобилей', 'автомобилем', 'автомобилестроения', 'автомобили', 'автомобиль', 

(2056, 2)
Cleaning and parsing the test set movie reviews...

Review 1000 of 2056

Review 2000 of 2056



In [4]:
# Load train.csv and test.csv (both decoded as UTF-8) for the pipeline
# classifiers below.
training = pd.read_csv("train.csv", encoding="utf-8")
testing = pd.read_csv("test.csv", encoding="utf-8")


In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
# SVC is required by classifier3; without this import the cell raises
# NameError on a fresh kernel.
from sklearn.svm import SVC


# Features (raw text) and labels (sentiment) for the train and test splits
x_train = training['text']
y_train = training['sentiment']
x_test = testing['text']
y_test = testing['sentiment']

# Pipeline 1: token counts -> TF-IDF weighting -> multinomial Naive Bayes
# (alpha is the additive smoothing parameter, set very close to zero here)
classifier1 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.00000001))])
# Pipeline 3: token counts -> TF-IDF weighting -> SVM with a polynomial kernel
classifier3 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(gamma=10,kernel='poly'))])

# Fit each pipeline on the training split and predict the test split
classifier1.fit(x_train, y_train)
predicted = classifier1.predict(x_test)
classifier3.fit(x_train, y_train)
predicted3 = classifier3.predict(x_test)
# Report test-set accuracy for both models, as a percentage
print ("1. SVM: ", "%.2f" % (accuracy_score(y_test, predicted3)*100), "%")
print ("NaiveBayes Multinomial:", "%.2f" % (accuracy_score(y_test, predicted)*100), "%")



1. SVM:  82.34 %
NaiveBayes Multinomial: 82.15 %
