In [84]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# function for reading the data files
def build_data_frame(path):
    rows = []
    index = []
    classification =[]
    for file_name, text, classification in read_files(path):
        rows.append({'text':text, 'class': classification})
        index.append(file_name)
        
    data_frame = pd.DataFrame(rows,index=index)
    return data_frame

In [3]:
def read_files(path):
    newline=''
    for root, dir_names, file_names in os.walk(path):
        print('Root folder: {0}'.format(root))
        print('Number of files read: {0}'.format(len(file_names)))
        for file_name in file_names:
            file_path = os.path.join(root,file_name)
            if(os.path.isfile(file_path)):
                #print(file_name)
                if("D" in file_name):
                    label="D"
                elif("R" in file_name):
                    label="R"
                else:
                    label="X"
                lines = []
                f = open(file_path)
                for line in f:
                    lines.append(line.rstrip("\n"))
                f.close()
                content=newline.join(lines)
                yield file_name, content, label

In [4]:
# here I set the path of data set using os.getcwd()
path = os.path.join(os.getcwd(), 'data_set')
# intitalize the empty data frame
data = pd.DataFrame({'text':[],'class':[]})
# call the function to build the data set
data=data.append(build_data_frame(path))

Root folder: /home/sadu/Documents/IE594-Data-Science/Speech-classification/data_set
Number of files read: 856


In [5]:
#Parse file name, reorder columns and print to csv

file_name = pd.Series(data.index.values)
bill, speaker, meta_date, file_name= file_name.str.split('_').str
data['bill_id'] = bill.values
data['speaker_id'] = speaker.values
data['meta_date'] = meta_date.values
data = data[['meta_date', 'speaker_id', 'class', 'bill_id', 'text']]
data.to_csv('clean_data.csv')

In [65]:
#Split data into train and test
train_data, test_data = train_test_split(data, test_size = 0.2, stratify = data['class'])

In [54]:
#Create N-gram features
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
counts = count_vectorizer.fit_transform(train_data['text'].values)

In [83]:
#Naive Bayes classifier
classifier = MultinomialNB()
targets = train_data['class'].values
classifier.fit(counts, targets)
#Test results
test_counts = count_vectorizer.transform(test_data['text'])
predictions = classifier.predict(test_counts)
print(confusion_matrix(test_data['class'].values, predictions))
classification_report(test_data['class'].values, predictions)

array([[41, 37],
       [40, 54]])