### Import modules

In [1]:
import numpy as np
import scipy as sc
import sklearn as sk
import matplotlib.pyplot as plt
import re

### Read the csv file

The text file is not cleanly delimited, so a custom reader is needed instead of a standard CSV parser.

In [2]:
def read_csv(file_name):
    """Return every line of ``file_name`` except the first (header) line.

    Lines are returned as raw strings exactly as read, line terminators
    included; no delimiter parsing happens here.
    """
    with open(file_name) as handle:
        rows = list(handle)
    # drop the header row
    return rows[1:]

### Split off the label (the last ','-separated field) from each line

In [3]:
def clean_csv(data, delimiter=','):
    """Split raw lines into cleaned text samples and lower-cased labels.

    The last ``delimiter``-separated field of each line is taken as the
    label; all preceding fields are re-joined with single spaces to form the
    text, so a delimiter occurring inside the text does not break the row.

    Parameters
    ----------
    data : iterable of str
        Raw lines, with or without a trailing line terminator.
    delimiter : str, optional
        Field separator (default ``','``).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Cleaned texts and their labels, index-aligned. Lines whose text part
        is empty (blank lines, lines without a delimiter) are dropped.
    """
    clean_data = []
    clean_labels = []
    for line in data:
        # split the line based on delimiter
        words = line.split(delimiter)
        # everything before the last field is the text
        data_line = ' '.join(words[:-1])
        if data_line != '':
            # lower-case, then keep only letters, spaces and apostrophes
            clean_data.append(re.sub("[^a-zA-Z ']", "", data_line.lower()))
            # strip() instead of chopping the last character: a final line
            # without '\n' must not lose a letter, and '\r' from CRLF files
            # must not end up inside the label
            clean_labels.append(words[-1].strip().lower())
    return np.array(clean_data), np.array(clean_labels)

In [4]:
# Load the raw lines and split them into cleaned text samples and labels
data = read_csv('../lang_data.csv')
clean_data, clean_labels = clean_csv(data)
# Sanity check: both arrays should have the same number of entries.
# Single-argument print(...) produces the same output on Python 2 and
# also runs on Python 3, unlike the bare print statement.
print("%d %d" % (clean_data.size, clean_labels.size))
print("%s %s" % (clean_data, clean_labels))

2761 2761
['ship shape and bristol fashion' 'know the ropes' 'graveyard shift' ...,
 'gofaster' 'red tape' 'in a pickle'] ['english' 'english' 'english' ..., 'english' 'english' 'english']


### Random split into training, validation and test sets

In [5]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer its replacement and fall back only on old installations
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 70% train, 15% validation, 15% test:
# first hold out 30% of the data, then split that hold-out in half
x_train, x_holdout, y_train, y_holdout = train_test_split(
    clean_data, clean_labels, test_size=0.3, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42)



In [6]:
# Sizes of each split: features (train, test, validation), then labels.
# Single-argument print(...) gives the same output on Python 2 and 3.
print("%d %d %d %d %d %d" % (x_train.size, x_test.size, x_val.size,
                             y_train.size, y_test.size, y_val.size))

1932 414 415 1932 414 415


## Bag of words approach

### Create the Bag of words Vectorizer

In [7]:
# Single-argument print(...) gives the same output on Python 2 and 3
print("Creating the bag of words...")
from sklearn.feature_extraction.text import CountVectorizer

# Create vectorizer: word-level tokens, default tokenizer/preprocessor,
# no stop-word removal, vocabulary capped at 1500 entries.
# (No backslashes needed: lines continue implicitly inside parentheses.)
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=1500)

# Use fit_transform on the training split to build the bag-of-words
# vocabulary and the corresponding dense count matrix
train_data_features = vectorizer.fit_transform(x_train).toarray()

Creating the bag of words...


In [8]:
# Shape of the dense training feature matrix.
# Parenthesized form prints the same tuple on Python 2 and 3.
print(train_data_features.shape)
## Take a look at the words in the vocabulary
##vocab = vectorizer.get_feature_names()
##print vocab

(1932, 1500)


In [9]:
# Single-argument print(...) gives the same output on Python 2 and 3
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees.
# A fixed random_state makes the fitted model (and the accuracy reported
# below) reproducible across runs; without it every run differs.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the forest to the training set, using the bag of words as
# features and the language labels as the response variable
forest = forest.fit(train_data_features, y_train)

Training the random forest...


In [10]:
# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(x_test).toarray()
# Use the random forest to predict the language label of each test sample
result = forest.predict(test_data_features)

# Accuracy: percentage of predictions that match the true labels
n = np.count_nonzero(result == y_test)
# Build the whole message as one string: the original `print (x),"%"`
# prints "x %" on Python 2 but silently drops the "%" on Python 3, where it
# parses as a print call followed by a discarded tuple.
print("%s %%" % (np.float32(n)*100/result.size))

96.8599033816 %
