In [1]:
import pandas as pd
import numpy as np
import re
import math
from statistics import mean
from matplotlib import pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
# Load profiles.csv into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer="profiles.csv")

FileNotFoundError: File b'profiles.csv' does not exist

In [None]:
# Show the row index and the column labels (the two axes of the frame).
print([df.index, df.columns])

In [None]:
# Frequency of each distinct value in the `job` column.
df["job"].value_counts()

In [None]:
# Histogram of profile ages, with the x-axis clipped to 16-70.
plt.hist(df["age"], bins=54)
plt.xlim(16, 70)
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Histogram of profile heights, with the x-axis clipped to 55-85.
plt.hist(df["height"], bins=90)
plt.xlim(55, 85)
plt.xlabel("Height")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Frequency of each distinct value in the `income` column.
df["income"].value_counts()

In [None]:
# Ordinal code for each `drinks` answer: position in the list is the code (0-5).
drink_mapping = {
    answer: code
    for code, answer in enumerate(
        ["not at all", "rarely", "socially", "often", "very often", "desperately"]
    )
}
# Answers missing from the mapping (including NaN) become NaN in the coded column.
df["drinks_code"] = df["drinks"].map(drink_mapping)

In [None]:
# Ordinal code for each `smokes` answer (0-4), paired up in list order.
smokes_mapping = dict(
    zip(["no", "trying to quit", "sometimes", "when drinking", "yes"], range(5))
)
df["smokes_code"] = df["smokes"].map(smokes_mapping)

In [None]:
# Ordinal code for each `drugs` answer.
drugs_mapping = {
    "never": 0,
    "sometimes": 1,
    "often": 2,
}
df["drugs_code"] = df["drugs"].map(drugs_mapping)

In [None]:
# Binary code for the `sex` column: "m" -> 0, "f" -> 1.
sex_mapping = dict(m=0, f=1)
df["sex_code"] = df["sex"].map(sex_mapping)

In [None]:
# Ordinal code for each `education` answer: the position in this list is the
# code (0-31), so reordering the list changes the encoding.
education_mapping = {
    answer: code
    for code, answer in enumerate([
        "dropped out of space camp",
        "working on space camp",
        "space camp",
        "graduated from space camp",
        "dropped out of high school",
        "working on high school",
        "high school",
        "graduated from high school",
        "dropped out of two-year college",
        "dropped out of college/university",
        "working on two-year college",
        "two-year college",
        "graduated from two-year college",
        "working on college/university",
        "college/university",
        "graduated from college/university",
        "dropped out of masters program",
        "dropped out of law school",
        "dropped out of ph.d program",
        "dropped out of med school",
        "working on masters program",
        "working on law school",
        "masters program",
        "law school",
        "graduated from masters program",
        "graduated from law school",
        "working on ph.d program",
        "working on med school",
        "ph.d program",
        "med school",
        "graduated from ph.d program",
        "graduated from med school",
    ])
}
df["education_code"] = df["education"].map(education_mapping)

# Caveat: forcing these answers into a single ordered hierarchy is probably misguided.

In [None]:
# Decade bucket of each age: floor(age / 10), e.g. 27 -> 2.
df["age_decade"] = df["age"].map(lambda a: math.floor(a / 10))

In [None]:
# Two-class age label: 0 for ages below 36, 1 otherwise.
df["age_stage"] = df["age"].map(lambda a: 0 if a < 36 else 1)

In [None]:
# The ten free-text essay columns.
essay_cols = ["essay0","essay1","essay2","essay3","essay4","essay5","essay6","essay7","essay8","essay9"]

# Blank out missing essays so every cell is a string that can be joined later.
# fillna('') replaces NaN directly; the previous regex-mode replace of the NaN
# sentinel was an indirect way of doing the same thing.
all_essays = df[essay_cols].fillna('')
# Show a small sample rather than printing the whole frame.
all_essays.head()

In [None]:
# Join each profile's essay answers into a single space-separated string.
x = all_essays.apply(' '.join, axis=1)

In [None]:
# Character count of the joined essay text for every profile.
essay_len = x.map(len)
df['essay_len'] = essay_len

In [None]:
def _mean_word_length(text):
    """Return the mean length of the alphabetic words in `text`.

    A word is a maximal run of ASCII letters. Returns 0 when `text` contains
    no such run, so empty essays do not make `mean` raise.
    """
    # One regex pass per essay: the word list doubles as the "any words?" check.
    words = re.findall("[a-zA-Z]+", text)
    if not words:
        return 0
    return mean([len(word) for word in words])


# Mean word length per profile, in the same order as the joined essays in `x`.
means1 = [_mean_word_length(s) for s in x]

In [None]:
# Spot-check one profile: the row at position 13, all columns.
df.iloc[13, :]

In [None]:
# Attach the per-profile mean word length (positional, one value per row).
df['word_len'] = pd.Series(means1, index=df.index)

In [None]:
# For every joined essay, the list of its alphabetic words (runs of ASCII letters).
words_lists = list(map(re.compile("[a-zA-Z]+").findall, x))

In [None]:
# Count first-person tokens per profile; only these exact spellings are matched.
me_counts = [
    sum(words.count(token) for token in ("I", "i", "me", "Me", "ME"))
    for words in words_lists
]
df['I_me_counts'] = me_counts

In [None]:
# Candidate features plus the `sign` label, keeping only fully populated rows.
feature_data0 = df[
    ['smokes_code', 'drinks_code', 'drugs_code', 'essay_len', 'word_len',
     'I_me_counts', 'education_code', 'income', 'sign']
].dropna()

In [None]:
# Numeric feature matrix (including income) taken from the NaN-free rows.
feature_data = feature_data0.loc[
    :,
    ['smokes_code', 'drinks_code', 'drugs_code', 'essay_len', 'word_len',
     'I_me_counts', 'education_code', 'income'],
]
# NOTE: this rebinds `x`, which earlier held the joined essay text.
x = feature_data.values

In [None]:
# Same feature set as above but without the income column.
feature_data1 = feature_data0.loc[
    :,
    ['smokes_code', 'drinks_code', 'drugs_code', 'essay_len', 'word_len',
     'I_me_counts', 'education_code'],
]
x1 = feature_data1.values

In [None]:
# Rescale every feature column to [0, 1] and wrap the result back into a frame
# with the original column names (the new frame gets a fresh 0..n-1 index).
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit(x).transform(x)
feature_data = pd.DataFrame(data=x_scaled, columns=feature_data.columns)

In [None]:
# Re-fit the shared scaler on the income-free matrix and rebuild that frame too.
x1_scaled = min_max_scaler.fit(x1).transform(x1)
feature_data1 = pd.DataFrame(data=x1_scaled, columns=feature_data1.columns)

In [None]:
# Scaled education code against scaled essay length.
plt.ylabel("Scaled Essay Length")
plt.xlabel("Scaled Education Code")
plt.scatter(x=feature_data['education_code'], y=feature_data['essay_len'], alpha=0.1)

In [None]:
# Scaled education code against scaled I/me token counts.
plt.ylabel("Scaled I/me Counts")
plt.xlabel("Scaled Education Code")
plt.scatter(x=feature_data['education_code'], y=feature_data['I_me_counts'], alpha=0.1)

In [None]:
# Unscaled education code against unscaled I/me token counts, over all profiles.
plt.ylabel("I/me Counts")
plt.xlabel("Education Code")
plt.scatter(x=df['education_code'], y=df['I_me_counts'], alpha=0.1)

In [None]:
# Drug-use code against drinking code; the very low alpha lets the density of
# overlapping points show through.
plt.ylabel("Drinks Code")
plt.xlabel("Drugs Code")
plt.scatter(x=df['drugs_code'], y=df['drinks_code'], alpha=0.005)

In [None]:
# Education code against mean word length, with the y-axis limited to 0-20.
plt.ylabel("Word Length")
plt.xlabel("Education Code")
plt.ylim(0, 20)
plt.scatter(x=df['education_code'], y=df['word_len'], alpha=0.1)
# Observation: the word-length distribution does reach higher values for codes
# 15 and 24 (graduated from college/university, graduated from masters program).
# Beyond that it is hard to draw any conclusion from this plot.

In [None]:
# Linear regression of income on the lifestyle and essay features.
feature_data2 = df[
    ['smokes_code', 'drinks_code', 'drugs_code', 'essay_len', 'word_len',
     'I_me_counts', 'education_code', 'income']
].dropna()
# Drop rows whose income value is negative.
feature_data2 = feature_data2[feature_data2['income'] >= 0]
feature_data2b = feature_data2[
    ['smokes_code', 'drinks_code', 'drugs_code', 'essay_len', 'word_len',
     'I_me_counts', 'education_code']
]

# Use a scaler created in this cell. `min_max_scaler2` is only defined in a
# later cell, so referencing it here raised NameError on a fresh kernel.
income_feature_scaler = preprocessing.MinMaxScaler()
scaled_feature_data2b = income_feature_scaler.fit_transform(feature_data2b.values)

regr = linear_model.LinearRegression()
model = regr.fit(scaled_feature_data2b, feature_data2['income'])

In [None]:
# Report the fitted coefficients and R^2 on the same (training) rows.
# (Predictions themselves are not computed here.)
print(regr.coef_)
print(model.score(scaled_feature_data2b, feature_data2['income']))

In [None]:
# k-NN (k=45) predicting the `sign` label from the scaled feature frame.
# fit() returns the estimator itself, so both names refer to one object.
model = KNeighborsClassifier(n_neighbors=45).fit(feature_data, feature_data0['sign'])
classifier = model

In [None]:
# Predict on the same rows the classifier was fitted on (no held-out split here).
zodiac_predict = classifier.predict(X=feature_data)

In [None]:
# Overall accuracy plus per-class recall / precision / F1 (average=None keeps
# one value per label) for the sign predictions.
accuracy = accuracy_score(feature_data0['sign'], zodiac_predict)
recall, precision, f1 = [
    per_class_metric(feature_data0['sign'], zodiac_predict, average=None)
    for per_class_metric in (recall_score, precision_score, f1_score)
]
print(accuracy, recall, precision, f1, sep="\n")

In [None]:
# Predict income from essay length and mean word length alone.
feature_data10 = df[['essay_len', 'word_len','income']].dropna()
min_max_scaler2 = preprocessing.MinMaxScaler()

# Only rows with a positive income value are used.
positive_income_feature_data = feature_data10.loc[feature_data10['income'] > 0]
x2 = min_max_scaler2.fit_transform(positive_income_feature_data.values)
positive_income_feature_data_scaled = pd.DataFrame(
    data=x2, columns=positive_income_feature_data.columns
)

# Features are the scaled essay columns; the target is the unscaled income.
regr = linear_model.LinearRegression()
model_income_from_essays = regr.fit(
    positive_income_feature_data_scaled[['essay_len','word_len']],
    positive_income_feature_data['income'],
)
income_predict_from_essays = model_income_from_essays.predict(
    positive_income_feature_data_scaled[['essay_len','word_len']]
)
regr_score = model_income_from_essays.score(
    positive_income_feature_data_scaled[['essay_len','word_len']],
    positive_income_feature_data['income'],
)

print(model_income_from_essays.coef_)
print(regr_score)
print(income_predict_from_essays)
print(positive_income_feature_data['income'])

# Result: multiple linear regression of income on essay length and word length
# performs very poorly.
# There is not enough data: fewer than 12K rows report an income, which is only
# about 20% of the dataset.
# Since the fit is this bad even on the training rows, going back to add a
# train/test split (which should have been done) would not change the verdict.

In [None]:
# Predict age from the I/me token count alone.
feature_data11 = df[['I_me_counts','age','age_decade']].dropna()
feature_data11 = feature_data11.loc[feature_data11['I_me_counts'] > 0]

features = feature_data11[['I_me_counts']]
labels = feature_data11[['age']]

features_scaled = pd.DataFrame(
    data=min_max_scaler2.fit_transform(features.values), columns=features.columns
)

regr = linear_model.LinearRegression()
model_age_from_essays = regr.fit(features_scaled, labels)
age_predict_from_essays = model_age_from_essays.predict(features_scaled)
regr_score = model_age_from_essays.score(features_scaled, labels)

print(model_age_from_essays.coef_)
print(regr_score)
print(age_predict_from_essays)

# Result: linear regression of age on "I"/"me" usage performs very poorly.
# There is simply no significant correlation, as the following graph shows
# (this is a trivial model with just a single feature).

In [None]:
# I/me token count against age for the rows used by the single-feature model.
plt.ylabel("Age")
plt.xlabel("I/me Counts")
plt.scatter(x=feature_data11['I_me_counts'], y=feature_data11['age'], alpha=0.2)

In [None]:
# Histogram of joined-essay lengths, x-axis limited to 0-10000 characters.
plt.xlim(0, 10000)
plt.hist(df["essay_len"], bins=1000)
plt.ylabel("Frequency")
plt.xlabel("essay_len")
plt.show()

In [None]:
# Predict age from all three essay-derived features.
feature_data11a = df[['essay_len','word_len','I_me_counts','age','age_decade']].dropna()
# Keep profiles with some essay text, at least one word and at least one I/me token.
feature_data11a = feature_data11a[
    (feature_data11a['essay_len'] > 50)
    & (feature_data11a['word_len'] > 0)
    & (feature_data11a['I_me_counts'] > 0)
]

features = feature_data11a[['essay_len','word_len','I_me_counts']]
labels = feature_data11a[['age']]

features_scaled = pd.DataFrame(
    data=min_max_scaler2.fit_transform(features.values), columns=features.columns
)

regr = linear_model.LinearRegression()
model_age_from_essays = regr.fit(features_scaled, labels)
age_predict_from_essays = model_age_from_essays.predict(features_scaled)
regr_score = model_age_from_essays.score(features_scaled, labels)

print(model_age_from_essays.coef_)
print(regr_score)

# Root-mean-square error of the predictions on the training rows.
rmse = np.sqrt(((labels - age_predict_from_essays) ** 2).mean())
print("------------------")
print(rmse)
# Result: multiple linear regression of age on all 3 essay features is still
# poor, yet 5 times better than using I_me_counts alone.
# Mildly interesting: the coefficient for I_me_counts is negative, suggesting
# that in this model heavier use of "I" and "me" indicates younger ages. The
# graph above supports this as well.

In [None]:
# Largest predicted age, then 100 "actual | predicted" pairs starting at row 1000.
print(age_predict_from_essays.max())
s = 1000
for i in range(s, s + 100):
    actual_age = labels['age'].iloc[i]
    print('{0} | {1}'.format(actual_age, age_predict_from_essays[i]))

In [None]:
# Predict age from the three essay features plus the education code.
feature_data11b = df[['essay_len','word_len','I_me_counts','education_code','age']].dropna()
feature_data11b = feature_data11b[
    (feature_data11b['essay_len'] > 50)
    & (feature_data11b['word_len'] > 0)
    & (feature_data11b['I_me_counts'] > 0)
]

features = feature_data11b[['essay_len','word_len','I_me_counts','education_code']]
labels = feature_data11b[['age']]

features_scaled = pd.DataFrame(
    data=min_max_scaler2.fit_transform(features.values), columns=features.columns
)

regr = linear_model.LinearRegression()
model_age_from_essays = regr.fit(features_scaled, labels)
age_predict_from_essays = model_age_from_essays.predict(features_scaled)
regr_score = model_age_from_essays.score(features_scaled, labels)

print(model_age_from_essays.coef_)
print(regr_score)
print(age_predict_from_essays)
print(labels)

# Result: adding education improves the fit again, but it is still poor.

In [None]:
# Same features as the previous regression, now with a distance-weighted k-NN
# regressor evaluated on a held-out 20% split.
feature_data11c = df[['essay_len','word_len','I_me_counts','education_code','age']].dropna()
feature_data11c = feature_data11c[
    (feature_data11c['essay_len'] > 50)
    & (feature_data11c['word_len'] > 0)
    & (feature_data11c['I_me_counts'] > 0)
]

x = feature_data11c[['essay_len','word_len','I_me_counts','education_code']]
y = feature_data11c[['age']]

x_scaled = pd.DataFrame(data=min_max_scaler2.fit_transform(x.values), columns=x.columns)

x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, train_size=0.8, test_size=0.2, random_state=6
)

regr = KNeighborsRegressor(n_neighbors = 200, weights = "distance")
model_age_from_essays = regr.fit(x_train, y_train['age'])
age_predict_from_essays = model_age_from_essays.predict(x_test)

regr_score = model_age_from_essays.score(x_test, y_test['age'])
print(regr_score)

# Root-mean-square error on the held-out rows.
rmse = np.sqrt(((y_test['age'] - age_predict_from_essays) ** 2).mean())
print("------------------")
print(rmse)

# Result: almost 3 times better than simple linear regression, but STILL poor.

In [None]:
# Predict income from sex and education codes.
feature_data11d = df[['sex_code','education_code','income']].dropna()
# Only rows with a positive income value are used.
feature_data11d = feature_data11d[feature_data11d['income'] > 0]

features = feature_data11d[['sex_code','education_code']]
labels = feature_data11d[['income']]

features_scaled = pd.DataFrame(min_max_scaler2.fit_transform(features.values), columns=features.columns)

regr = linear_model.LinearRegression()
# The name `model_age_from_essays` is kept from the original cell even though
# this model predicts income.
model_age_from_essays = regr.fit(features_scaled, labels)
predict_from_essays = regr.predict(features_scaled)

print(model_age_from_essays.coef_)
regr_score = regr.score(features_scaled, labels)
print(regr_score)
# Print this cell's income predictions. The original printed
# `age_predict_from_essays`, the stale output of the preceding age model.
print(predict_from_essays)

# Income from education and sex: nothing to report. Both coefficients are
# negative, but given the model's poor predictive power that likely means little.

In [None]:
# Sex code against income for the positive-income rows.
plt.ylabel("Income")
plt.xlabel("Sex Code")
plt.scatter(x=feature_data11d['sex_code'], y=feature_data11d['income'], alpha=0.01)

In [None]:
# Education code against income, with the y-axis limited to 0-400000.
plt.ylabel("Income")
plt.xlabel("Education Code")
plt.ylim(0, 400000)
plt.scatter(x=feature_data11d['education_code'], y=feature_data11d['income'], alpha=0.01)
# Observation: higher salaries do appear to be correlated with codes 15 and 24,
# i.e. "graduated from college/university" and "graduated from masters program".
# There just aren't enough data points for code 31, "graduated from med school".

In [None]:
# k-NN (k=50) predicting sex from essay features and income, 80/20 split.
feature_data11e = df[['essay_len','word_len','I_me_counts','income','sex']]
feature_data11e = feature_data11e.dropna()
feature_data11e = feature_data11e[feature_data11e['essay_len'] > 50]
feature_data11e = feature_data11e[feature_data11e['word_len'] > 0]
feature_data11e = feature_data11e[feature_data11e['I_me_counts'] > 0]

x = feature_data11e[['essay_len','word_len','I_me_counts','income']]

x_scaled = pd.DataFrame(min_max_scaler2.fit_transform(x.values), columns=x.columns)

y = feature_data11e[['sex']]

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, train_size=0.8, test_size=0.2, random_state=6)

classifier = KNeighborsClassifier(n_neighbors=50)
model = classifier.fit(x_train, y_train['sex'])
model_predict = model.predict(x_test)
# The stray `k_data.append(model_score)` was removed: both names are first
# defined in the k-sweep cell that follows, so on a fresh kernel this cell
# raised NameError before any metric was printed.

accuracy = accuracy_score(y_test, model_predict)
print(accuracy)
# average=None reports one value per class instead of a single aggregate.
recall = recall_score(y_test, model_predict, average=None)
print(recall)
precision = precision_score(y_test, model_predict, average=None)
print(precision)
f1 = f1_score(y_test, model_predict, average=None)
print(f1)

In [None]:
# Test accuracy of the sex classifier for k = 1..98.
# Index 0 holds a placeholder so that list position equals k.
k_data = [0]
for i in range(1, 99):
    model = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train['sex'])
    classifier = model  # fit() returns the estimator itself
    model_score = model.score(x_test, y_test['sex'])
    k_data += [model_score]

plt.plot(range(len(k_data)), k_data)

# Result: prediction is only slightly better than half. In other words, useless.

In [None]:
# k-NN (k=50) predicting age decade from essay features and income, 80/20 split.
feature_data11e = df[['essay_len','word_len','I_me_counts','income','age_decade']]
feature_data11e = feature_data11e.dropna()
feature_data11e = feature_data11e[feature_data11e['essay_len'] > 50]
feature_data11e = feature_data11e[feature_data11e['word_len'] > 0]
feature_data11e = feature_data11e[feature_data11e['I_me_counts'] > 0]

x = feature_data11e[['essay_len','word_len','I_me_counts','income']]

x_scaled = pd.DataFrame(min_max_scaler2.fit_transform(x.values), columns=x.columns)

y = feature_data11e[['age_decade']]

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, train_size=0.8, test_size=0.2, random_state=6)

classifier = KNeighborsClassifier(n_neighbors=50)
model = classifier.fit(x_train, y_train['age_decade'])
model_predict = model.predict(x_test)
# The stray `k_data.append(model_score)` was removed: it appended a score left
# over from an earlier cell (or raised NameError if that cell had not run) and
# has nothing to do with this model.

accuracy = accuracy_score(y_test, model_predict)
print(accuracy)
# average=None reports one value per class instead of a single aggregate.
recall = recall_score(y_test, model_predict, average=None)
print(recall)
precision = precision_score(y_test, model_predict, average=None)
print(precision)
f1 = f1_score(y_test, model_predict, average=None)
print(f1)

In [None]:
# Test accuracy of the age-decade classifier for k = 1..98.
# Index 0 holds a placeholder so that list position equals k.
k_data = [0]
for i in range(1, 99):
    model = KNeighborsClassifier(n_neighbors=i).fit(x_train, y_train['age_decade'])
    classifier = model  # fit() returns the estimator itself
    model_score = model.score(x_test, y_test['age_decade'])
    k_data += [model_score]

plt.plot(range(len(k_data)), k_data)

In [None]:
# Class balance of the held-out age-decade labels.
y_test["age_decade"].value_counts()

In [None]:
# Range of predicted decades on the test split, then on the training split
# (predicted with whatever estimator `model` currently refers to).
print(model_predict.min(), model_predict.max(), sep="\n")
model_predict2 = model.predict(x_train)
print(model_predict2.min(), model_predict2.max(), sep="\n")

In [None]:
# Rebuild the joined essay text: `x` was rebound to feature matrices in the
# cells above, so it is recomputed here.
essay_cols = ["essay0","essay1","essay2","essay3","essay4","essay5","essay6","essay7","essay8","essay9"]
# fillna('') blanks missing essays directly instead of routing the NaN sentinel
# through a regex-mode replace.
all_essays = df[essay_cols].fillna('')
# One space-separated string per profile.
x = all_essays.apply(' '.join, axis=1)
df['essays_joined'] = x

In [None]:
# NOTE: rebinds `means1` to the per-essay word lists (it held mean word lengths before).
means1 = [list(re.findall("[a-zA-Z]+", s)) for s in x]

# Words of five or more letters per essay, and the set of all such words.
words_lists = list(map(re.compile("[a-zA-Z][a-zA-Z][a-zA-Z][a-zA-Z][a-zA-Z]+").findall, x))
unique_words = list({word for words in words_lists for word in words})

# Next: feed these into a Naive Bayes classifier to see whether sex or education
# can be predicted.

In [None]:
# Vocabulary size, then each profile's long words re-joined into one string.
print(len(unique_words))
essays_long_words = list(map(" ".join, words_lists))
df['essays_long_words'] = essays_long_words

In [None]:
# Naive Bayes on long-word counts, predicting sex (80/20 split).
feature_data12 = df[['essay_len','essays_long_words','sex']].dropna()
feature_data12 = feature_data12.loc[feature_data12['essay_len'] > 50]

x_train, x_test, y_train, y_test = train_test_split(
    feature_data12[['essay_len','essays_long_words']],
    feature_data12[['sex']],
    train_size=0.8, test_size=0.2, random_state=6,
)

# The vectorizer's vocabulary is learned from the global word list.
counter = CountVectorizer().fit(unique_words)
training_counts = counter.transform(x_train['essays_long_words'])
test_counts = counter.transform(x_test['essays_long_words'])

classifier = MultinomialNB().fit(training_counts, y_train['sex'])
model_predict = classifier.predict(test_counts)

# Accuracy, then per-class recall / precision / F1.
accuracy = accuracy_score(y_test, model_predict)
recall, precision, f1 = [
    per_class_metric(y_test, model_predict, average=None)
    for per_class_metric in (recall_score, precision_score, f1_score)
]
print(accuracy, recall, precision, f1, sep="\n")

In [None]:
# Naive Bayes on long-word counts, predicting the raw education answer.
feature_data13 = df[['essay_len','essays_long_words','education']].dropna()
feature_data13 = feature_data13.loc[feature_data13['essay_len'] > 50]

x_train, x_test, y_train, y_test = train_test_split(
    feature_data13[['essay_len','essays_long_words']],
    feature_data13[['education']],
    train_size=0.8, test_size=0.2, random_state=6,
)

counter = CountVectorizer().fit(unique_words)
training_counts = counter.transform(x_train['essays_long_words'])
test_counts = counter.transform(x_test['essays_long_words'])

classifier = MultinomialNB().fit(training_counts, y_train['education'])
score = classifier.score(test_counts, y_test['education'])
print(score)

model_predict = classifier.predict(test_counts)
accuracy = accuracy_score(y_test, model_predict)
recall, precision, f1 = [
    per_class_metric(y_test, model_predict, average=None)
    for per_class_metric in (recall_score, precision_score, f1_score)
]
print(accuracy, recall, precision, f1, sep="\n")

In [None]:
# Naive Bayes on long-word counts, predicting job.
feature_data13a = df[['essay_len','essays_long_words','job']].dropna()
feature_data13a = feature_data13a.loc[feature_data13a['essay_len'] > 50]

x_train, x_test, y_train, y_test = train_test_split(
    feature_data13a[['essay_len','essays_long_words']],
    feature_data13a[['job']],
    train_size=0.8, test_size=0.2, random_state=6,
)
counter = CountVectorizer().fit(unique_words)
training_counts = counter.transform(x_train['essays_long_words'])
test_counts = counter.transform(x_test['essays_long_words'])
classifier = MultinomialNB().fit(training_counts, y_train['job'])
score = classifier.score(test_counts, y_test['job'])
print(score)

In [None]:
# Naive Bayes on long-word counts, predicting age decade.
feature_data13b = df[['essay_len','essays_long_words','age_decade']].dropna()
feature_data13b = feature_data13b.loc[feature_data13b['essay_len'] > 50]

x_train, x_test, y_train, y_test = train_test_split(
    feature_data13b[['essay_len','essays_long_words']],
    feature_data13b[['age_decade']],
    train_size=0.8, test_size=0.2, random_state=6,
)
counter = CountVectorizer().fit(unique_words)
training_counts = counter.transform(x_train['essays_long_words'])
test_counts = counter.transform(x_test['essays_long_words'])
classifier = MultinomialNB().fit(training_counts, y_train['age_decade'])
score = classifier.score(test_counts, y_test['age_decade'])
print(score)

In [None]:
# Naive Bayes on long-word counts, predicting the two-class age stage.
feature_data14 = df[['essay_len','essays_long_words','age_stage']].dropna()
feature_data14 = feature_data14.loc[feature_data14['essay_len'] > 50]

x_train, x_test, y_train, y_test = train_test_split(
    feature_data14[['essay_len','essays_long_words']],
    feature_data14[['age_stage']],
    train_size=0.8, test_size=0.2, random_state=6,
)
counter = CountVectorizer().fit(unique_words)
training_counts = counter.transform(x_train['essays_long_words'])
test_counts = counter.transform(x_test['essays_long_words'])
classifier = MultinomialNB().fit(training_counts, y_train['age_stage'])
score = classifier.score(test_counts, y_test['age_stage'])
print(score)
# Result: not bad. Word choice gives an 82% prediction rate for "35 and under"
# versus "over 35".

In [None]:
# Accuracy and per-class recall / precision / F1 for the current classifier on
# the current test split.
model_predict = classifier.predict(test_counts)
accuracy = accuracy_score(y_test, model_predict)
recall, precision, f1 = [
    per_class_metric(y_test, model_predict, average=None)
    for per_class_metric in (recall_score, precision_score, f1_score)
]
print(accuracy, recall, precision, f1, sep="\n")

Recall and precision are both better for the "younger" class.
This can easily happen when the NUMBER of misclassified items is
about the same for both classes but the total number of "younger"
items is greater than the total number of "older" items.
The confusion matrix could be like:

|&nbsp;| young    |   old|
|----------------|----------|------|
|predicted young |     17   |     3|
|predicted old   |      3   |     6|

... and would produce values roughly approximating those in the previous output:<br>

|&nbsp;| young    |   old|
|----------------|----------|------|
|recall |     0.8811651   |     0.67666667|
|precision   |      0.87093262   |     0.69694132|




In [None]:
# Naive Bayes on long-word counts, predicting how seriously religion is taken.
feature_data15 = df[['essay_len','essays_long_words','religion']].dropna()
# .copy() makes the filtered frame independent, so adding a column below does
# not trigger pandas' SettingWithCopyWarning.
feature_data15 = feature_data15[feature_data15['essay_len'] > 50].copy()


def _religion_seriousness(religion_text):
    """Map a `religion` answer to its seriousness label.

    Phrases are tested in the listed order and the first match wins; answers
    containing none of them get 'no_qualifier'.
    """
    phrase_labels = (
        ('but not too serious', 'not too'),
        ('and laughing', 'laughing'),
        ('and very serious', 'very'),
        ('and somewhat serious', 'somewhat'),
    )
    for phrase, label in phrase_labels:
        if re.search(phrase, religion_text) is not None:
            return label
    return 'no_qualifier'


feature_data15["religion_serious"] = [_religion_seriousness(a) for a in feature_data15["religion"]]

x_train, x_test, y_train, y_test = train_test_split(feature_data15[['essay_len','essays_long_words']], feature_data15[['religion_serious']],
                                                    train_size=0.8, test_size=0.2, random_state=6)
counter = CountVectorizer()
counter.fit(unique_words)
training_counts = counter.transform(x_train['essays_long_words'])
test_counts = counter.transform(x_test['essays_long_words'])
classifier = MultinomialNB()
classifier.fit(training_counts, y_train['religion_serious'])
score = classifier.score(test_counts, y_test['religion_serious'])
print(score)

In [None]:
# First word of each `religion` answer (the religion name without its qualifier).
# The pattern is a raw string: '\w' in a plain literal is an invalid escape
# sequence and produces a warning on current Python versions.
# .assign() builds a new frame instead of writing a column onto a filtered slice.
feature_data15 = feature_data15.assign(
    religion_base=[re.search(r'(\w+)', a).group(1) for a in feature_data15["religion"]]
)

In [None]:
# Frequency of each base religion among the filtered profiles.
feature_data15["religion_base"].value_counts()

In [None]:
# Naive Bayes on long-word counts, predicting the base religion.
x_train, x_test, y_train, y_test = train_test_split(
    feature_data15[['essay_len','essays_long_words']],
    feature_data15[['religion_base']],
    train_size=0.8, test_size=0.2, random_state=6,
)
counter = CountVectorizer().fit(unique_words)
training_counts = counter.transform(x_train['essays_long_words'])
test_counts = counter.transform(x_test['essays_long_words'])
classifier = MultinomialNB().fit(training_counts, y_train['religion_base'])
score = classifier.score(test_counts, y_test['religion_base'])
print(score)
# Result: 34%. Not very good, but not much was expected from this one.

In [None]:
# Idea: run a Naive Bayes model on the text of the "religion" column, looking
# only at the words below. They split answers into non-religious ones and
# religious ones graded by level of seriousness.
religion_words = [
    # Non-religious: "other" is included on the guess that people pick it when
    # they aren't religious and just don't care.
    'agnosticism', 'atheism', 'other',
    # Seriousness qualifiers attached to religious answers.
    'too', 'very', 'somewhat', 'laughing',
]

In [None]:
# Naive Bayes predicting drug use from the religion answer, using only the
# hand-picked religion words as vocabulary.
feature_data16 = df[['religion','drugs']].dropna()

x_train, x_test, y_train, y_test = train_test_split(
    feature_data16[['religion']],
    feature_data16[['drugs']],
    train_size=0.8, test_size=0.2, random_state=42,
)
counter2 = CountVectorizer().fit(religion_words)
training_counts = counter2.transform(x_train['religion'])
test_counts = counter2.transform(x_test['religion'])
classifier = MultinomialNB().fit(training_counts, y_train['drugs'])
# First impression from classifier.score on the test split: about 81%, which
# looked good at the time.

model_predict = classifier.predict(test_counts)
accuracy = accuracy_score(y_test, model_predict)
recall, precision, f1 = [
    per_class_metric(y_test, model_predict, average=None)
    for per_class_metric in (recall_score, precision_score, f1_score)
]
print(accuracy, recall, precision, f1, sep="\n")

In [None]:
# Every prediction that is something other than 'never'.
list(filter(lambda label: label != 'never', model_predict))

In [None]:
# Class balance of the drug-use answers used for this model.
feature_data16["drugs"].value_counts()

In [None]:
# Share of 'never' answers, i.e. the accuracy of always predicting 'never'.
# Computed from the data instead of hand-copied counts, so it stays correct if
# the input file or the filtering changes.
(feature_data16['drugs'] == 'never').mean()

The above predictor ended up always predicting 'never' for drug usage! Its accuracy appeared to be reasonably good only because 80.3% of respondents entered 'never'. This is exactly the kind of situation that shows why accuracy alone can't be trusted to give the full story.

In [None]:
# Naive Bayes predicting drinking habits from the religion answer, using only
# the hand-picked religion words as vocabulary.
feature_data17 = df[['religion','drinks']].dropna()

x_train, x_test, y_train, y_test = train_test_split(
    feature_data17[['religion']],
    feature_data17[['drinks']],
    train_size=0.8, test_size=0.2, random_state=6,
)
counter2 = CountVectorizer().fit(religion_words)
training_counts = counter2.transform(x_train['religion'])
test_counts = counter2.transform(x_test['religion'])
classifier = MultinomialNB().fit(training_counts, y_train['drinks'])
# First impression from classifier.score on the test split: about 71%, which
# also looked decent at the time.
model_predict = classifier.predict(test_counts)
accuracy = accuracy_score(y_test, model_predict)
recall, precision, f1 = [
    per_class_metric(y_test, model_predict, average=None)
    for per_class_metric in (recall_score, precision_score, f1_score)
]
print(accuracy, recall, precision, f1, sep="\n")

In [None]:
# Display the array of predicted `drinks` labels for the test split.
model_predict

In [None]:
# Class balance of the drinking answers used for this model.
feature_data17["drinks"].value_counts()

In [None]:
# Every prediction that is something other than 'socially'.
list(filter(lambda label: label != 'socially', model_predict))

Same problem as above. The model looks good if we only look at the accuracy but it is always just reporting the most common value.