In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from scipy.stats import pearsonr

In [None]:
# Load the survey responses. `df_original` stays untouched so that later cells
# can restart from the raw data; `df` is the working copy.
df_original = pd.read_csv('data/responses.csv')
df = df_original.copy()
df.head()

In [None]:
# Show all column names, followed by the dtype pandas inferred for each one.
columns = list(df.columns)
print(columns)
for name, dtype in zip(columns, df.dtypes):
    print(name, dtype)

In [None]:
# Number of missing values per column, largest first.
nulls = df.isna().sum().sort_values(ascending=False)
nulls.plot.bar(figsize=(25, 5))

In [None]:
# Number of missing values per respondent, keeping only rows that have more
# than one missing answer.
missing_per_row = df.isna().sum(axis=1)
nulls = missing_per_row[missing_per_row > 1].sort_values(ascending=False)
nulls.plot.bar(figsize=(25, 5))

In [None]:
# Column explored in the plots below and used as the prediction target further
# down. 'Village - town' is the alternative target this notebook was used with.
var_of_interest = 'Only child'

# Filter into a separate frame instead of dropping rows from `df` in place, so
# re-running this cell never changes what other cells see.
plot_data = df.dropna(subset=[var_of_interest])

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
# Left: overall answer counts. Right: the same counts split by gender.
sns.countplot(y=var_of_interest, data=plot_data, ax=ax[0])
sns.countplot(y=var_of_interest, hue='Gender', data=plot_data, ax=ax[1])
# plt.xticks/plt.yticks only touch the most recently created axes, so set the
# tick label size on each subplot explicitly.
for axis in ax:
    axis.tick_params(axis='both', labelsize=14)
plt.show()

In [None]:
# Clean up the categorical answers and make them numerical so they are ready
# for comparison and prediction tasks. Always start from the raw copy so this
# cell can be re-run safely.
df = df_original.copy()
print(df.Education.unique())

# Integer code for every categorical answer, keyed by the column it belongs to.
# The city/village codes are tied to "Village - town" itself (not to whatever
# `var_of_interest` currently is), so that column is encoded for every target.
category_codes = {
    "Village - town": {'city': 0, 'village': 1},
    "Gender": {'male': 0, 'female': 1},
    "House - block of flats": {'block of flats': 0, 'house/bungalow': 1},
    "Only child": {'no': 0, 'yes': 1},
    "Left - right handed": {'left handed': 0, 'right handed': 1},
    "Smoking": {'never smoked': 1, 'tried smoking': 2, 'former smoker': 3, 'current smoker': 4},
    "Alcohol": {'drink a lot': 3, 'social drinker': 2, 'never': 1},
    "Punctuality": {'i am always on time': 2, 'i am often early': 1, 'i am often running late': 3},
    "Lying": {'never': 1, 'sometimes': 2, 'only to avoid hurting someone': 3, 'everytime it suits me': 4},
    "Internet usage": {'few hours a day': 3, 'most of the day': 4, 'less than an hour a day': 2, 'no time at all': 1},
    "Education": {'college/bachelor degree': 4, 'secondary school': 3, 'primary school': 2,
                  'masters degree': 5, 'doctorate degree': 6, 'currently a primary school pupil': 1},
}

for column, codes in category_codes.items():
    # Report answers that have no code; `map` turns them into NaN below.
    unexpected = set(df[column].dropna().unique()) - set(codes)
    if unexpected:
        print("Unmapped answers in", column, "->", unexpected)
    # Per-column `map` yields a numeric column directly and does not depend on
    # `replace` down-casting object columns.
    df[column] = df[column].map(codes)

# Missing education is coded as 0. Assign the result back instead of calling
# fillna(inplace=True) on the attribute, which may act on a temporary copy.
# The other encoded columns keep their NaN values.
df["Education"] = df["Education"].fillna(0)
print(df.Education.unique())

# Share of female respondents (Gender == 1) among those with a known gender,
# for each value of the binary target. The printed number is a fraction in
# [0, 1], not a percentage.
for c in range(2):
    gender_in_group = df.loc[df[var_of_interest] == c, "Gender"]
    print("Prozent female in", c, "is", gender_in_group.mean())

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))

# A constant dummy category lets each violin plot draw a single violin that is
# split by the target. It lives on a copy so `df` does not gain a string column.
violin_data = df.assign(all="")
for axis, column in zip(ax, ['Height', 'Weight', 'Age']):
    sns.violinplot(x=column, y="all", hue=var_of_interest,
                   data=violin_data.dropna(subset=[column]), split=True, ax=axis)

# Stacked age histogram per target value, drawn explicitly on the fourth axes
# and labelled from the current target rather than with fixed names.
var_of_int_ser = df[var_of_interest]
ax[3].hist([df.loc[var_of_int_ser == 1, 'Age'].dropna(),
            df.loc[var_of_int_ser == 0, 'Age'].dropna()],
           label=[var_of_interest + ' = 1', var_of_interest + ' = 0'],
           stacked=True, bins=30)
ax[3].set_xlabel('Age')
ax[3].legend();

In [None]:
# Numeric view of the data; `numeric_columns` is captured before the helper
# column is added so it only lists real variables.
numerics = df.select_dtypes(include=['int64', 'float64', 'uint8', 'int32', 'bool']).copy()
numeric_columns = numerics.columns.tolist()
# Constant dummy category so every violin plot shows one violin split by hue.
numerics["all"] = ""

# Size the grid from the number of columns actually plotted, rounding up, so
# that no column falls outside the grid.
n_rows = 4
n_cols = max(1, (len(numeric_columns) + n_rows - 1) // n_rows)
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(40, 40), squeeze=False)
axes = ax.ravel()

for axis, column in zip(axes, numeric_columns):
    try:
        sns.violinplot(x=column, y="all", hue=var_of_interest, data=numerics, split=True, ax=axis)
    except Exception as err:
        # Keep going with the remaining columns, but say which one failed.
        print("Could not plot", column, "-", err)
        continue
    axis.legend()

# Hide the grid cells that were not needed.
for axis in axes[len(numeric_columns):]:
    axis.set_visible(False)
plt.show()

In [None]:
# Exploratory look at the pairwise correlations between the numeric variables
# (DataFrame.corr defaults to Pearson). Correlation inspected like this says
# nothing about causation.
# The helper column "all" holds strings and is excluded, because recent pandas
# versions raise on non-numeric columns instead of skipping them.
print(numerics.drop(columns=["all"], errors="ignore").corr())
print()
# Pair plot: the target regressed against every numeric column, one panel each.
g = sns.pairplot(data=numerics, y_vars=[var_of_interest], x_vars=numeric_columns, kind="reg")
g.fig.set_size_inches(150, 15)

In [None]:
# Turn `numerics` into the feature table: remove the plotting helper column and
# the target. Reassigning with errors="ignore" keeps this cell re-runnable.
numerics = numerics.drop(columns=["all", var_of_interest], errors="ignore")

# Fill missing feature values with the column mean.
# NOTE(review): the means are computed on all rows, including those that end up
# in the dev split; compute them on the training rows only if that matters.
numerics = numerics.fillna(numerics.mean())
print(numerics.mean())

# Only respondents with a known target are usable. Filling missing targets with
# 0 would silently label them as the negative class.
has_label = df[var_of_interest].notna()
X = np.array(numerics.loc[has_label])
y = np.array(df.loc[has_label, var_of_interest])
print(X.shape)

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# L1-penalised logistic regression; the regularisation strength is chosen by
# 5-fold cross-validation on ROC AUC.
clf_reg = LogisticRegressionCV(Cs=10, cv=5, solver='liblinear', class_weight='balanced',
                               penalty='l1', scoring='roc_auc', random_state=42).fit(X_train, y_train)
In [None]:
# Score the fitted classifier on the held-out dev split.
y_predict = clf_reg.predict(X_dev)
confusion = confusion_matrix(y_dev, y_predict)
score = accuracy_score(y_dev, y_predict)
print("Accuracy: %f" % score)
print(confusion)

In [None]:
print(clf_reg.coef_)
# argsort returns the positions that order |coefficient| ascending. Index the
# column names with those positions (reversed) to list the features from
# largest to smallest absolute weight.
sort_index = np.argsort(np.abs(clf_reg.coef_[0]))
sorted_features = numerics.columns[sort_index[::-1]].tolist()
print(sorted_features)

In [None]:
# Correlation of every numeric column with the target. Non-numeric columns are
# filtered out first because recent pandas versions raise on them.
df.select_dtypes(include="number").corr()[var_of_interest]

In [None]:
# Pearson correlation (coefficient, p-value) between the target and a few
# selected columns. Rows are dropped pairwise, i.e. only when one of the two
# columns being compared is missing, so a gap in an unrelated column does not
# discard the respondent.
for other in ["Children", "Age", "Education"]:
    pair = df[[var_of_interest, other]].dropna()
    print(other, pearsonr(pair[var_of_interest], pair[other]))

In [None]:
# Switch the target and list its correlation with every numeric column.
# Non-numeric columns are filtered out first because recent pandas versions
# raise on them.
var_of_interest = 'Loneliness'
df.select_dtypes(include="number").corr()[var_of_interest]

In [None]:
# Switch the target and list its correlation with every numeric column.
# Non-numeric columns are filtered out first because recent pandas versions
# raise on them.
var_of_interest = 'Happiness in life'
df.select_dtypes(include="number").corr()[var_of_interest]