In [4]:
import os
import re
from collections import Counter
import pickle
import csv
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn import metrics 
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [5]:
all_book_files = ["/Users/anikasethi/decisiontree/data_generation/littlewomen.txt", "data_generation/greatgatsby.txt", "/Users/anikasethi/decisiontree/data_generation/frankenstine.txt"]
text = ""
for file_path in all_book_files:
    with open(file_path, "r") as f:
        text += f.read().lower()  #lowercase conversation
        
# use regex rules to preprocess/fix spacing and convention
text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # take out puncutation; only letters, numbers, spaces
text = re.sub(r"\n", " ", text)
text = " ".join(text.split())

with open("text.txt", "w") as f:
    f.write(text)

print(text)



In [6]:
# now, need to build vocab by splitting up data
# "some text" -> "x" where x is a single predictive character

window_size = 7

with open("/Users/anikasethi/decisiontree/data_generation/processed/data.csv", "w", newline='') as csvfile:
    fields = ["window", "prediction"]
    writer = csv.DictWriter(csvfile, fieldnames = fields)
    writer.writeheader()
    for c in range(len(text)- window_size):
        writer.writerow({"window": text[c:c+window_size], "prediction":text[c+window_size]})

print("preprocessed and segmented data saved successfully!")

preprocessed and segmented data saved successfully!


In [7]:
df = pd.read_csv("/Users/anikasethi/decisiontree/data_generation/processed/data.csv")
df.head()

Unnamed: 0,window,prediction
0,i playi,n
1,playin,g
2,playing,
3,laying,p
4,aying p,i


need to encode character -> number because decision tree library only takes numbers

In [None]:
unique_characters = (set(text))
print(unique_characters)
print(len(unique_characters))

encoding = {}

index = 0
for i in unique_characters:
    encoding[i] = index
    index = index + 1

print("Encoding\n" , encoding)

reverse = {}
for key, value in encoding.items():
    reverse[value] = key

print("Reverse\n", reverse)

{'j', 'p', 'x', 'q', 'a', 'h', '0', '7', '3', '2', 'r', 'c', 'y', '6', 's', 'd', '4', 'l', 'b', 'u', 'f', 'v', '5', ' ', 't', 'i', 'm', '9', '8', 'w', 'e', 'g', '1', 'k', 'o', 'z', 'n'}
37
Encoding
: {'j': 0, 'p': 1, 'x': 2, 'q': 3, 'a': 4, 'h': 5, '0': 6, '7': 7, '3': 8, '2': 9, 'r': 10, 'c': 11, 'y': 12, '6': 13, 's': 14, 'd': 15, '4': 16, 'l': 17, 'b': 18, 'u': 19, 'f': 20, 'v': 21, '5': 22, ' ': 23, 't': 24, 'i': 25, 'm': 26, '9': 27, '8': 28, 'w': 29, 'e': 30, 'g': 31, '1': 32, 'k': 33, 'o': 34, 'z': 35, 'n': 36}
Reverse
 {0: 'j', 1: 'p', 2: 'x', 3: 'q', 4: 'a', 5: 'h', 6: '0', 7: '7', 8: '3', 9: '2', 10: 'r', 11: 'c', 12: 'y', 13: '6', 14: 's', 15: 'd', 16: '4', 17: 'l', 18: 'b', 19: 'u', 20: 'f', 21: 'v', 22: '5', 23: ' ', 24: 't', 25: 'i', 26: 'm', 27: '9', 28: '8', 29: 'w', 30: 'e', 31: 'g', 32: '1', 33: 'k', 34: 'o', 35: 'z', 36: 'n'}


In [9]:
def str_to_int(string):
    ints = []
    for i in string:
        ints.append(encoding[i])
    return ints

df['encoded_prediction'] = df['prediction'].apply(lambda x: encoding[x])
df['encoded_window'] = df['window'].apply(str_to_int)
encoded_df = df.drop(columns=['prediction', 'window'])
print(df)
encoded_df

          window prediction  encoded_prediction                encoded_window
0        i playi          n                  36    [25, 23, 1, 17, 4, 12, 25]
1         playin          g                  31    [23, 1, 17, 4, 12, 25, 36]
2        playing                             23    [1, 17, 4, 12, 25, 36, 31]
3        laying           p                   1   [17, 4, 12, 25, 36, 31, 23]
4        aying p          i                  25    [4, 12, 25, 36, 31, 23, 1]
...          ...        ...                 ...                           ...
1703243  t new e          b                  18  [24, 23, 36, 30, 29, 23, 30]
1703244   new eb          o                  34  [23, 36, 30, 29, 23, 30, 18]
1703245  new ebo          o                  34  [36, 30, 29, 23, 30, 18, 34]
1703246  ew eboo          k                  33  [30, 29, 23, 30, 18, 34, 34]
1703247  w ebook          s                  14  [29, 23, 30, 18, 34, 34, 33]

[1703248 rows x 4 columns]


Unnamed: 0,encoded_prediction,encoded_window
0,36,"[25, 23, 1, 17, 4, 12, 25]"
1,31,"[23, 1, 17, 4, 12, 25, 36]"
2,23,"[1, 17, 4, 12, 25, 36, 31]"
3,1,"[17, 4, 12, 25, 36, 31, 23]"
4,25,"[4, 12, 25, 36, 31, 23, 1]"
...,...,...
1703243,18,"[24, 23, 36, 30, 29, 23, 30]"
1703244,34,"[23, 36, 30, 29, 23, 30, 18]"
1703245,34,"[36, 30, 29, 23, 30, 18, 34]"
1703246,33,"[30, 29, 23, 30, 18, 34, 34]"


test/train split

In [10]:
X = pd.DataFrame(df['encoded_window'].tolist()) # feature
y = encoded_df["encoded_prediction"] #prediction/target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, test_size = 0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1362598, 7)
(340650, 7)
(1362598,)
(340650,)


decision trees

In [11]:
clf = DecisionTreeClassifier(max_depth=35, random_state=20)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
y_train_pred = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

print("testing accuracy:", accuracy)
print("training accuracy:", train_accuracy)


testing accuracy: 0.6026185234111258
training accuracy: 0.7309756802813449


In [12]:
def new_words(model, start, length = 100):
    rv = start
    window_size = len(start)
    while len(rv) < length:
        last_char = rv[-window_size:]
        X_inp = [[encoding[ch] for ch in last_char]]
        output_label = model.predict(X_inp)
        rv += reverse[output_label[0]]
    return rv


In [13]:
print("test 1: ", new_words(clf, "this sl"))
print("test 2: ", new_words(clf, "promise"))
print("test 3: ", new_words(clf, " disk j"))
print("test 4: ", new_words(clf, "celesti"))
print("test 5: ", new_words(clf, "ancdefg"))


test 1:  this sleep and every one was considered a short and she was a little shore and the street and she wa
test 2:  promise that i was a little shore and the street and she was a little shore and the street and she w
test 3:   disk just say i was a little shore and the street and she was a little shore and the street and she
test 4:  celestial city had read and a little shore and the street and she was a little shore and the street 
test 5:  ancdefgives her pretty things and the street and she was a little shore and the street and she was a


random forest

In [21]:
rf_clf = RandomForestClassifier(random_state=20, n_estimators=100)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
y_train_pred = rf_clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

print("test accuracy:", accuracy)
print("train accuracy:", train_accuracy)


test accuracy: 0.6106267429913401
train accuracy: 0.7312751082857893


In [23]:
print("test 1: ", new_words(rf_clf, "this sl"))
print("test 2: ", new_words(rf_clf, "promise"))
print("test 3: ", new_words(rf_clf, " disk j"))
print("test 4: ", new_words(rf_clf, "celesti"))
print("test 5: ", new_words(rf_clf, "ancdefg"))

test 1:  this sleeves up the spirit of the street and she was a little souls they sat together and the street
test 2:  promise to learn to see the same time to write to me and i was a little souls they sat together and 
test 3:   disk just sat and the street and she was a little souls they sat together and the street and she wa
test 4:  celestial city i easily be describe the same time to write to me and i was a little souls they sat t
test 5:  ancdefgr and the street and she was a little souls they sat together and the street and she was a li
