# IMPORTED LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing

# Default figure size for every plot in this notebook, as (width, height) in inches.
plt.rcParams['figure.figsize'] = [50, 60]

#%matplotlib inline
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever name this install ships.
plt.style.use(
    'seaborn-whitegrid'
    if 'seaborn-whitegrid' in plt.style.available
    else 'seaborn-v0_8-whitegrid'
)

# READING THE DATA

In [2]:
# Cell contents that should be parsed as NaN when the CSV files are read.
missing_values = ['0', 'nA', '#NUM!']

# Load the training set first, then the test set, with identical NaN handling.
# NOTE(review): the recorded output of this cell looks like the tail of a pandas
# warning (possibly a mixed-dtype DtypeWarning) — confirm whether explicit dtypes
# are wanted here.
training_data, test_data = (
    pd.read_csv(csv_path, na_values=missing_values)
    for csv_path in ("data/income-train.csv", "data/income-test.csv")
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Map the verbose CSV headers onto the short column names used in the rest of the notebook.
rename_cols = {
    "Crime Level in the City of Employement": "Crime Level",
    "Work Experience in Current Job [years]": "Work Experience",
    "Body Height [cm]": "Body Height",
    "Yearly Income in addition to Salary (e.g. Rental Income)": "Side Income",
    "Total Yearly Income [EUR]": "Total Income",
}

training_data = training_data.rename(columns=rename_cols)

In [None]:
#training_data.head()

# Training Data Pre-processing

## - Dropping/Changing Rows/Columns

In [4]:
# Remove the 'Instance' column from the training frame.
training_data = training_data.drop(columns=['Instance'])

In [5]:
# Keep only the first occurrence of each fully identical row.
training_data = training_data.drop_duplicates()

In [6]:
# Outliers: rows that do not fit the 'Year of Record' x 'Total Income' and
# 'Housing situation' x 'Total Income' graphs — a total income strictly between
# 500000 and 700000 combined with a year of record before 1995.
outlier_rows = (
    (training_data['Total Income'] > 500000)
    & (training_data['Total Income'] < 700000)
    & (training_data['Year of Record'] < 1995)
)
training_data.drop(index=training_data[outlier_rows].index.tolist(), inplace=True)

In [7]:
# Fold the abbreviated label 'f' into 'female' so both spellings share one category.
training_data['Gender'] = training_data['Gender'].replace({'f': 'female'})

In [8]:
# Strip the trailing currency suffix so the column can be converted to numbers later
# (values are assumed to look like '<number> EUR' — TODO confirm against the CSV).
# str.rstrip() takes a *set of characters*, not a pattern, so the previous
# rstrip('\sEUR') never removed the whitespace, relied on an invalid '\s' escape and
# raised on NaN cells. The vectorised regex replace removes optional whitespace plus
# 'EUR' at the end of each string and leaves missing values untouched.
training_data['Side Income'] = training_data['Side Income'].str.replace(
    r'\s*EUR\s*$', '', regex=True
)

## - Dealing with NaN values

In [9]:
# Replacement value for missing entries, keyed by column name.
# NOTE(review): "Bold" for "Hair Color" may be a typo of "Bald"; it only acts as a
# category label here — confirm before changing.
column_NaN_values = {
    "Housing Situation": "shared_place",
    "Crime Level": 0,
    "Work Experience": "0",
    "Satisfation with employer": "Somewhat Unhappy",
    "Gender": "unknown",
    "Profession": "no job",
    "University Degree": "No",
    "Wears Glasses": 0,
    "Hair Color": "Bold",
    "Country": "No",
    "Year of Record": 0,
}

# Fill each listed column that is present in the frame; other columns are left as they are.
for col, fill_value in column_NaN_values.items():
    if col in training_data.columns:
        training_data[col] = training_data[col].fillna(value=fill_value)


## - Handling of Column Data Types

In [10]:
# Convert 'Work Experience' to a numeric dtype (the NaN fill above may have introduced the string "0").
training_data['Work Experience'] = training_data['Work Experience'].pipe(pd.to_numeric)

In [11]:
# Convert 'Side Income' to a numeric dtype; unparsable values raise (the pandas default).
training_data['Side Income'] = pd.to_numeric(
    training_data['Side Income'],
    errors='raise',
)

## - Encoding Categorical Features

In [12]:
# Label-encode every object-dtype column into integer codes.
# The fitted encoder of each column is kept in `label_encoders` so the exact same
# category -> code mapping can later be applied to other data (e.g. test_data) or
# reversed with inverse_transform; previously each encoder was thrown away after use.
label_encoders = {}
for col in training_data.dtypes[training_data.dtypes == 'object'].index.tolist():
    le = preprocessing.LabelEncoder()
    # fit_transform learns the sorted unique labels and encodes the column in one pass.
    training_data[col] = le.fit_transform(training_data[col])
    label_encoders[col] = le

In [13]:
# Display the dtype of every column to check the result of the conversions and encoding.
training_data.dtypes

Year of Record               float64
Housing Situation              int64
Crime Level                  float64
Work Experience              float64
Satisfation with employer      int64
Gender                         int64
Age                            int64
Country                        int64
Size of City                   int64
Profession                     int64
University Degree              int64
Wears Glasses                float64
Hair Color                     int64
Body Height                    int64
Side Income                  float64
Total Income                 float64
dtype: object

In [14]:
#training_data.head()

## - Graphs

In [None]:
#sns.catplot(x="Satisfation with employer", y="Total Income", data=training_data[["Satisfation with employer", "Total Income"]])

In [None]:
#sns.countplot(training_data['Work Experience'])

In [None]:
#sns.relplot(x="Work Experience", y="Total Income", data=training_data[["Work Experience", "Total Income"]])

In [None]:
#training_data['Crime Level'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Crime Level'])

In [None]:
#sns.relplot(x="Crime Level", y="Total Income", data=training_data[["Crime Level", "Total Income"]]);

In [None]:
#training_data['Housing Situation'].value_counts(dropna=False)

In [None]:
#sns.countplot(training_data['Housing Situation'])

In [None]:
#sns.catplot(x="Housing Situation", y="Total Income", data=training_data[["Housing Situation", "Total Income"]])

In [None]:
#sns.countplot(training_data['Year of Record'])

In [None]:
#sns.relplot(x="Year of Record", y="Total Income", data=training_data[["Year of Record", "Total Income"]]);

## - Side Work

In [None]:
#training_data.dropna(subset=['Year of Record', 'Country'], inplace=True) #******

In [None]:
# Display the pairwise correlation matrix of the columns (pandas' default method is Pearson).
training_data.corr()