# IMPORTED LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Default size (in inches) for every figure created in this notebook.
plt.rcParams['figure.figsize'] = [40, 20]

#%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names. Prefer the new name when this
# installation provides it; otherwise keep the original one so older
# Matplotlib versions behave exactly as before.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

# READING THE DATA

In [2]:
# Extra tokens that pandas should parse as missing values in both CSV files.
# NOTE(review): because '0' is listed, cells whose text is exactly 0 are read
# as NaN; some later cells fill NaN back with 0 -- confirm this is intended.
missing_values = ['0', 'nA', '#NUM!']

# Load the training set first, then the test set, with identical NA handling.
training_data, test_data = (
    pd.read_csv(csv_path, na_values=missing_values)
    for csv_path in ("data/income-train.csv", "data/income-test.csv")
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Map the verbose CSV column headers to the short names used in the rest of
# the notebook (keys must match the file's headers exactly, typos included).
rename_cols = {
    "Crime Level in the City of Employement": "Crime Level",
    "Work Experience in Current Job [years]": "Work Experience",
    "Body Height [cm]": "Body Height",
    "Yearly Income in addition to Salary (e.g. Rental Income)": "Side Income",
    "Total Yearly Income [EUR]": "Total Income",
}

# Apply the aliases to the training frame in place.
training_data.rename(rename_cols, axis='columns', inplace=True)

In [None]:
#training_data.head()

# Training Data Pre-processing

## - Dropping/Changing Rows/Columns

In [4]:
# Remove the 'Instance' column from the training frame (in place).
training_data.drop('Instance', axis='columns', inplace=True)

In [5]:
# Discard fully duplicated rows, keeping the first occurrence of each.
training_data.drop_duplicates(keep='first', inplace=True)

In [6]:
# Rows lacking either of these two columns are dropped rather than imputed.
training_data.dropna(
    axis='index',
    how='any',
    subset=['Year of Record', 'Country'],
    inplace=True,
)

In [7]:
# Outliers: rows that do not fit the 'Year of Record' x 'Total Income' and
# 'Housing situation' x 'Total Income' plots. They are the records with a
# total income strictly between 500000 and 700000 dated before 1995.
outlier_mask = (
    training_data['Total Income'].gt(500000)
    & training_data['Total Income'].lt(700000)
    & training_data['Year of Record'].lt(1995)
)
training_data.drop(index=training_data.loc[outlier_mask].index, inplace=True)

In [8]:
# Fold the abbreviated label 'f' into the spelled-out 'female' category.
training_data['Gender'] = training_data['Gender'].replace({'f': 'female'})

In [22]:
# Strip the trailing "EUR" unit (and the whitespace around it) so the column
# can be converted with pd.to_numeric further down.
# presumably the raw values look like "1234.5 EUR" -- verify against the CSV.
#
# The previous version called str.rstrip('\sEUR'). rstrip() takes a set of
# characters, not a regex, so the whitespace before the unit was never removed,
# and '\s' is an invalid escape in a normal string literal. The per-element
# lambda also raised AttributeError on missing (NaN) entries. The vectorised
# .str accessor with a raw-string regex fixes all three and leaves NaN as NaN.
training_data['Side Income'] = training_data['Side Income'].str.replace(
    r'\s*EUR\s*$', '', regex=True
)

## - Dealing with NaN values

In [9]:
# Missing housing information defaults to the 'shared_place' category.
housing_situation = training_data['Housing Situation']
training_data['Housing Situation'] = housing_situation.fillna('shared_place')

In [10]:
# Missing crime levels are filled with 0.
crime_level = training_data['Crime Level']
training_data['Crime Level'] = crime_level.fillna(0)

In [11]:
# Missing work experience is treated as zero years.
# Fill with the number 0 rather than the string '0': a string placeholder
# mixes str and float values in one column and forces a later re-parse,
# whereas the other numeric columns in this section are filled with numeric 0.
# The later pd.to_numeric call on this column remains valid either way.
training_data['Work Experience'] = training_data['Work Experience'].fillna(value=0)

In [12]:
# Missing satisfaction ratings default to 'Somewhat Unhappy'.
# (The column name's spelling is kept as the code uses it.)
satisfaction = training_data['Satisfation with employer']
training_data['Satisfation with employer'] = satisfaction.fillna('Somewhat Unhappy')

In [13]:
# Missing gender entries go into an explicit 'unknown' category.
gender = training_data['Gender']
training_data['Gender'] = gender.fillna('unknown')

In [14]:
# Missing professions are labelled 'no job'.
profession = training_data['Profession']
training_data['Profession'] = profession.fillna('no job')

In [15]:
# Missing degree information is recorded as 'No'.
university_degree = training_data['University Degree']
training_data['University Degree'] = university_degree.fillna('No')

In [16]:
# Missing values in the glasses flag are filled with 0.
wears_glasses = training_data['Wears Glasses']
training_data['Wears Glasses'] = wears_glasses.fillna(0)

In [17]:
# Missing hair colours are filled with the placeholder category 'Bold'.
# NOTE(review): 'Bold' is presumably meant to be 'Bald'; the literal is left
# unchanged because it becomes a category value other code may rely on.
hair_color = training_data['Hair Color']
training_data['Hair Color'] = hair_color.fillna('Bold')

## - Handling of Column Data Types

In [18]:
# Parse work experience as numbers; any unparseable entry raises.
work_experience = training_data['Work Experience']
training_data['Work Experience'] = pd.to_numeric(work_experience, errors='raise')

In [25]:
# Parse side income as numbers; any unparseable entry raises.
side_income = training_data['Side Income']
training_data['Side Income'] = pd.to_numeric(side_income, errors='raise')

In [26]:
# Display the dtype of every column to confirm the conversions above.
training_data.dtypes

Year of Record               float64
Housing Situation             object
Crime Level                  float64
Work Experience              float64
Satisfation with employer     object
Gender                        object
Age                            int64
Country                       object
Size of City                   int64
Profession                    object
University Degree             object
Wears Glasses                float64
Hair Color                    object
Body Height                    int64
Side Income                  float64
Total Income                 float64
dtype: object

## - Encoding Categorical Features

In [None]:
#for col in train.dtypes[train.dtypes == 'object'].index.tolist():
#    feat_le = LabelEncoder()
#    feat_le.fit(data[col].unique().astype(str))
#    data[col] = feat_le.transform(data[col].astype(str))

## - Graphs

In [None]:
#sns.catplot(x="Satisfation with employer", y="Total Income", data=training_data[["Satisfation with employer", "Total Income"]])

In [None]:
#sns.countplot(training_data['Work Experience'])

In [None]:
#sns.relplot(x="Work Experience", y="Total Income", data=training_data[["Work Experience", "Total Income"]])

In [None]:
#training_data['Crime Level'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Crime Level'])

In [None]:
#sns.relplot(x="Crime Level", y="Total Income", data=training_data[["Crime Level", "Total Income"]]);

In [None]:
#training_data['Housing Situation'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Housing Situation'])

In [None]:
#sns.catplot(x="Housing Situation", y="Total Income", data=training_data[["Housing Situation", "Total Income"]])

In [None]:
#sns.countplot(training_data['Year of Record'])

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=training_data[["Year of Record", "Total Income"]]);

## - Side Work

In [28]:
# Pairwise Pearson correlation between the numeric features and the target.
# pandas >= 2.0 no longer drops non-numeric columns silently in corr() and
# raises on the object-typed categorical columns, so restrict the frame to
# numeric/bool columns first. This is the same column set older pandas
# versions used implicitly, and select_dtypes works on old and new releases.
training_data.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Year of Record,Crime Level,Work Experience,Age,Size of City,Wears Glasses,Body Height,Side Income,Total Income
Year of Record,1.0,-0.000781,0.000122,-0.000184,5e-05,0.00059,0.000216,0.001076,0.643563
Crime Level,-0.000781,1.0,0.191978,0.197523,9.8e-05,0.001556,-0.000938,-0.000876,-0.026944
Work Experience,0.000122,0.191978,1.0,0.968826,-0.000541,0.000118,0.055527,0.00031,0.108466
Age,-0.000184,0.197523,0.968826,1.0,-0.000404,0.000135,0.001067,0.000181,0.10367
Size of City,5e-05,9.8e-05,-0.000541,-0.000404,1.0,-0.000889,-0.001228,-0.001425,0.024541
Wears Glasses,0.00059,0.001556,0.000118,0.000135,-0.000889,1.0,0.000558,-0.001407,0.000779
Body Height,0.000216,-0.000938,0.055527,0.001067,-0.001228,0.000558,1.0,0.001601,0.021191
Side Income,0.001076,-0.000876,0.00031,0.000181,-0.001425,-0.001407,0.001601,1.0,0.205381
Total Income,0.643563,-0.026944,0.108466,0.10367,0.024541,0.000779,0.021191,0.205381,1.0
