In [41]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [19]:
# function for reading the data files
def build_data_frame(path):
    rows = []
    index = []
    classification =[]
    for file_name, text, classification in read_files(path):
        rows.append({'text':text, 'class': classification})
        index.append(file_name)
        
    data_frame = pd.DataFrame(rows,index=index)
    return data_frame

In [20]:
def read_files(path):
    newline=''
    for root, dir_names, file_names in os.walk(path):
        print('Root folder: {0}'.format(root))
        print('Number of files read: {0}'.format(len(file_names)))
        for file_name in file_names:
            file_path = os.path.join(root,file_name)
            if(os.path.isfile(file_path)):
                #print(file_name)
                if("D" in file_name):
                    label="D"
                elif("R" in file_name):
                    label="R"
                else:
                    label="X"
                lines = []
                f = open(file_path)
                for line in f:
                    lines.append(line.rstrip("\n"))
                f.close()
                content=newline.join(lines)
                yield file_name, content, label

In [21]:
# here I set the path of data set using os.getcwd()
path = os.path.join(os.getcwd(), 'data_set')
# intitalize the empty data frame
data = pd.DataFrame({'text':[],'class':[]})
# call the function to build the data set
data=data.append(build_data_frame(path))

Root folder: /home/sadu/Documents/IE594-Data-Science/Speech-classification/data_set
Number of files read: 856


In [22]:
#Parse file name, reorder columns and print to csv

file_name = pd.Series(data.index.values)
bill, speaker, meta_date, file_name= file_name.str.split('_').str
data['bill_id'] = bill.values
data['speaker_id'] = speaker.values
data['meta_date'] = meta_date.values
data = data[['meta_date', 'speaker_id', 'class', 'bill_id', 'text']]
data.to_csv('clean_data.csv')

In [60]:
#Split data into train and test
train_data, test_data = train_test_split(data, test_size = 0.1, stratify = data['class'])

In [61]:
#Create N-gram features
count_vectorizer = CountVectorizer(ngram_range=(1, 3))
x_train = count_vectorizer.fit_transform(train_data['text'])
y_train = train_data['class'].values

In [62]:
#Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(x_train, y_train)
#Test results
x_test = count_vectorizer.transform(test_data['text'])
y_test = test_data['class'].values
predictions = classifier.predict(x_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[18 21]
 [ 5 42]]
             precision    recall  f1-score   support

          D       0.78      0.46      0.58        39
          R       0.67      0.89      0.76        47

avg / total       0.72      0.70      0.68        86



In [63]:
#SVM Classifier
classifier_svc = LinearSVC()
classifier_svc.fit(x_train, y_train)
predictions_svc = classifier_svc.predict(x_test)
print(classification_report(y_test, predictions_svc))

             precision    recall  f1-score   support

          D       0.62      0.62      0.62        39
          R       0.68      0.68      0.68        47

avg / total       0.65      0.65      0.65        86



In [64]:
#Random Forest Classifier
classifier_rf = RandomForestClassifier()
classifier_rf.fit(x_train, y_train)
predictions_rf = classifier_svc.predict(x_test)
print(classification_report(y_test, predictions_svc))

             precision    recall  f1-score   support

          D       0.62      0.62      0.62        39
          R       0.68      0.68      0.68        47

avg / total       0.65      0.65      0.65        86



In [11]:
#Starting to create matrix for plot
count_vectorizer_plot = CountVectorizer(ngram_range=(3, 3), max_features = 50)
counts_plot = count_vectorizer_plot.fit_transform(data['text'])