# In [1]:
# Import the pandas package, then use the "read_json" function to read
# the labeled training data
# WARNING: Training the model takes about 10-25 mins, because vocab size was set to 10 000 and number of trees to 100
import pandas as pd  
import re     
import nltk
import json
import numpy
from nltk.corpus import stopwords

# Load the labeled training data from the UTF-8 encoded JSON file
train = pd.read_json("train.json", encoding="utf-8")


# Matches every character that is not a Russian letter.  "ё"/"Ё" are listed
# explicitly because they lie outside the contiguous а-я / А-Я ranges; without
# them a word such as "ёлка" would be broken into fragments.
_NON_RUSSIAN_LETTERS = re.compile("[^а-яА-ЯёЁ]")

# Lazily filled cache of the NLTK Russian stop word set, so the list is
# converted to a set once instead of on every call to text_to_words().
_RUSSIAN_STOPS = None


def _russian_stopwords():
    # Return the cached NLTK Russian stop word set, building it on first use.
    global _RUSSIAN_STOPS
    if _RUSSIAN_STOPS is None:
        _RUSSIAN_STOPS = frozenset(stopwords.words("russian"))
    return _RUSSIAN_STOPS


def text_to_words(raw_text, stops=None):
    """Convert a raw text to a string of lower-cased, space-separated words.

    Steps:
      1. Replace every character that is not a Russian letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop every word that is contained in ``stops``.
      4. Join the remaining words with single spaces.

    Args:
        raw_text: A single string (the raw text).
        stops: Optional collection of words to remove.  When ``None``
            (the default) the NLTK Russian stop word list is used.

    Returns:
        A single string (the preprocessed text); empty if no word survives.
    """
    # "is None" rather than truthiness: an explicitly empty collection
    # means "remove nothing", not "use the default list".
    if stops is None:
        stops = _russian_stopwords()

    # 1. Remove non-letters
    letters_only = _NON_RUSSIAN_LETTERS.sub(" ", raw_text)

    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 3. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 4. Join the words back into one string separated by space
    return " ".join(meaningful_words)

# Get the number of texts based on the dataframe column size
num_texts = train["text"].size

print("Cleaning and parsing the training set texts...\n")

# Clean every text in order.  Iterating over the column values (instead of
# looking rows up by label with train["text"][i]) also works when the
# DataFrame index is not 0..n-1.
clean_train_texts = []
for position, raw_text in enumerate(train["text"], start=1):
    # Print a progress message after every 1000th text
    if position % 1000 == 0:
        print("Text %d of %d\n" % (position, num_texts))
    clean_train_texts.append(text_to_words(raw_text))

print("Creating the bag of words...\n")

from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  The vocabulary is capped at 10000 entries.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=10000,
)

# fit_transform() does two things: first, it fits the model and learns the
# vocabulary; second, it transforms the training data into feature vectors.
# The input to fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_texts)

# Numpy arrays are easy to work with, so convert the result to an array
train_data_features = train_data_features.toarray()

# Take a look at the words in the vocabulary.  get_feature_names() was
# removed in scikit-learn 1.2, so prefer get_feature_names_out() and fall
# back to the old method only on versions that lack the new one.  tolist()
# keeps `vocab` a plain list, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Copy the first 2000 cleaned texts to a pandas dataframe with a "text"
# column and a "sentiment" column
output = pd.DataFrame(
    data={
        "text": clean_train_texts[:2000],
        "sentiment": train["sentiment"].iloc[:2000],
    }
)

# Use pandas to write the comma-separated output file
# (quoting=3 is csv.QUOTE_NONE: fields are written without quotes)
output.to_csv("train2.csv", index=False, quoting=3)



# In [10]:
# Build a smaller sample: the first 500 cleaned texts together with the
# first 500 sentiment labels
output = pd.DataFrame(
    data={
        "text": clean_train_texts[:500],
        "sentiment": train["sentiment"][:500],
    }
)

# Write the sample as a comma-separated file without the index column
# (quoting=3 is csv.QUOTE_NONE: fields are written without quotes)
output.to_csv("train1.csv", index=False, quoting=3)