Name: Yash Satra
Roll No: 1811109
Batch: B1

### Importing packages

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

### Loading dataset

In [2]:
# Load only the six survey columns this notebook works with; the path is
# relative to the notebook's working directory.
df = pd.read_csv(
    './datasets/stackoverflow/survey_results_public.csv',
    usecols=[
        'Age',
        'YearsCode',
        'YearsCodePro',
        'WorkLoc',
        'ConvertedComp',
        'Hobbyist',
    ],
)

### Dataframe description

The actual Stack Overflow survey dataset comprises 85 columns, but for simplicity only 6 columns are selected:
1. Age - Age of respondent
2. YearsCode - Years the respondent has been coding
3. YearsCodePro - Years the respondent has been coding professionally
4. ConvertedComp - Salary of respondent in USD
5. WorkLoc - Working location
6. Hobbyist - Whether coding is respondent's hobby or not

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Hobbyist,YearsCode,YearsCodePro,ConvertedComp,WorkLoc,Age
0,Yes,4.0,,,,14.0
1,No,,,,,19.0
2,Yes,3.0,1,8820.0,Home,28.0
3,No,3.0,Less than 1 year,61000.0,Home,22.0
4,Yes,16.0,9,,Office,30.0


In [4]:
# For every column, print its name and then the distinct values it holds,
# so non-numeric placeholders and missing values are easy to spot.
for col_name in df.columns:
    distinct_values = df[col_name].unique()
    print(col_name)
    print(distinct_values)

Hobbyist
['Yes' 'No']
YearsCode
['4' nan '3' '16' '13' '6' '8' '12' '2' '5' '17' '10' '14' '35' '7'
 'Less than 1 year' '30' '9' '26' '40' '19' '15' '20' '28' '25' '1' '22'
 '11' '33' '50' '41' '18' '34' '24' '23' '42' '27' '21' '36' '32' '39'
 '38' '31' '37' 'More than 50 years' '29' '44' '45' '48' '46' '43' '47'
 '49']
YearsCodePro
[nan '1' 'Less than 1 year' '9' '3' '4' '10' '8' '2' '13' '18' '5' '14'
 '22' '23' '19' '35' '20' '25' '7' '15' '27' '6' '48' '12' '31' '11' '17'
 '16' '21' '29' '30' '26' '33' '28' '37' '40' '34' '24' '39' '38' '36'
 '32' '41' '45' '43' 'More than 50 years' '44' '42' '46' '49' '50' '47']
ConvertedComp
[    nan   8820.  61000. ...  38766.  13272. 588012.]
WorkLoc
[nan 'Home' 'Office' 'Other place, such as a coworking space or cafe']
Age
[14.  19.  28.  22.  30.  42.  24.  23.   nan 21.  31.  20.  26.  29.
 38.  47.  34.  32.  25.  17.  35.  27.  44.  43.  62.  37.  45.  18.
 33.  36.  16.  39.  64.  41.  54.  49.  40.  56.  12.  58.  46.  59.
 51.  48.  57

### Replacing 'Less than 1 year' by 0 and 'More than 50 years' by 51 in YearsCode and YearsCodePro columns

In [5]:
# Map the two textual answers in YearsCode to numeric-looking strings.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['YearsCode']`: that form is chained
# assignment through an in-place method, which warns on recent pandas and
# leaves `df` unchanged once Copy-on-Write is enabled.
# The replacements are kept as strings ('0', '51') like the other values
# in this column.
df['YearsCode'] = df['YearsCode'].replace(
    {'Less than 1 year': '0', 'More than 50 years': '51'}
)

In [6]:
# Map the two textual answers in YearsCodePro to numeric-looking strings.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['YearsCodePro']`: that form is chained
# assignment through an in-place method, which warns on recent pandas and
# leaves `df` unchanged once Copy-on-Write is enabled.
# The replacements are kept as strings ('0', '51') like the other values
# in this column.
df['YearsCodePro'] = df['YearsCodePro'].replace(
    {'Less than 1 year': '0', 'More than 50 years': '51'}
)

### Replacing NaN in the WorkLoc column with "Doesn't work", assuming the respondent is not currently working and therefore has no work location

In [7]:
# Give respondents with a missing WorkLoc the explicit category
# "Doesn't work". The result is assigned back to the column rather than
# using `.replace(..., inplace=True)` on `df['WorkLoc']`, because that
# chained in-place call warns on recent pandas and does not modify `df`
# once Copy-on-Write is enabled.
df['WorkLoc'] = df['WorkLoc'].replace(np.nan, "Doesn't work")

### Converting to np array

In [8]:
# Feature matrix: the five predictor columns, in this fixed order, as a
# NumPy array (column positions are relied on by the later cells).
x = df[
    ['Age', 'YearsCode', 'YearsCodePro', 'WorkLoc', 'Hobbyist']
].values

In [9]:
# Target vector: the ConvertedComp column as a NumPy array.
y = df.loc[:, 'ConvertedComp'].values

In [10]:
# Show the feature matrix and then the target vector, one after the other.
print(x, y, sep='\n')

[[14.0 '4' nan "Doesn't work" 'Yes']
 [19.0 nan nan "Doesn't work" 'No']
 [28.0 '3' '1' 'Home' 'Yes']
 ...
 [nan nan nan "Doesn't work" 'No']
 [nan nan nan "Doesn't work" 'No']
 [18.0 '8' '3' "Doesn't work" 'Yes']]
[  nan   nan 8820. ...   nan   nan   nan]


### Taking care of missing data

In [11]:
from sklearn.impute import SimpleImputer

# Imputer that fills NaN entries with the mean of their column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [12]:
# Columns 0-2 of x are Age, YearsCode and YearsCodePro. Learn the column
# means and write the imputed values back into those same columns.
x[:, :3] = imputer.fit_transform(x[:, :3])
print(x)

[[14.0 4.0 8.15634123044221 "Doesn't work" 'Yes']
 [19.0 11.662114216834588 8.15634123044221 "Doesn't work" 'No']
 [28.0 3.0 1.0 'Home' 'Yes']
 ...
 [30.336698649160457 11.662114216834588 8.15634123044221 "Doesn't work"
  'No']
 [30.336698649160457 11.662114216834588 8.15634123044221 "Doesn't work"
  'No']
 [18.0 8.0 3.0 "Doesn't work" 'Yes']]


### Replacing NaN in ConvertedComp with 0, on the assumption that those respondents have no salary

In [13]:
# Rebind `imputer` to a constant-fill imputer that replaces NaN with 0.
# NOTE(review): this treats a missing ConvertedComp as "no salary"; a
# missing value could also mean the question was not answered - confirm.
imputer = SimpleImputer(
    strategy='constant',
    fill_value=0,
    missing_values=np.nan,
)

In [14]:
# The imputer expects 2-D input, so view y as a single column first, then
# fit and apply the constant fill in one step.
y = imputer.fit_transform(y.reshape(-1, 1))
print(y)

[[   0.]
 [   0.]
 [8820.]
 ...
 [   0.]
 [   0.]
 [   0.]]


### Encoding the independent variables WorkLoc and Hobbyist

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode WorkLoc (column 3) and Hobbyist (column 4); the remaining
# columns are passed through unchanged and placed after the encoded ones.
# sparse_threshold=0 makes the transformer always return a dense array:
# with the default threshold a sufficiently sparse result would come back
# as a SciPy sparse matrix, and np.array() on that yields a 0-d object
# array that cannot be sliced by column in the following cells.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3, 4])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))
print(x)

[[1.0 0.0 0.0 ... 14.0 4.0 8.15634123044221]
 [1.0 0.0 0.0 ... 19.0 11.662114216834588 8.15634123044221]
 [0.0 1.0 0.0 ... 28.0 3.0 1.0]
 ...
 [1.0 0.0 0.0 ... 30.336698649160457 11.662114216834588 8.15634123044221]
 [1.0 0.0 0.0 ... 30.336698649160457 11.662114216834588 8.15634123044221]
 [1.0 0.0 0.0 ... 18.0 8.0 3.0]]


### Standardization (zero mean, unit variance) of Age, YearsCode and YearsCodePro

In [16]:
# Check the (rows, columns) dimensions of x after one-hot encoding, to
# locate where the passthrough numeric columns start.
x.shape

(88883, 9)

In [17]:
from sklearn.preprocessing import StandardScaler

# x has 9 columns: the first six are the one-hot indicators produced by
# the encoder, and columns 6 onward are Age, YearsCode and YearsCodePro.
# Only those trailing numeric columns are scaled.
sc = StandardScaler()
sc.fit(x[:, 6:])
x[:, 6:] = sc.transform(x[:, 6:])
print(x)

[[1.0 0.0 0.0 ... -1.8854710558560477 -0.8416266814142354
  -3.097702542750127e-13]
 [1.0 0.0 0.0 ... -1.3084049373128603 5.4633511775484314e-15
  -3.097702542750127e-13]
 [0.0 1.0 0.0 ... -0.2696859239351231 -0.9514692989472768
  -1.0220799676198955]
 ...
 [1.0 0.0 0.0 ... 1.0373762506178708e-13 5.4633511775484314e-15
  -3.097702542750127e-13]
 [1.0 0.0 0.0 ... 1.0373762506178708e-13 5.4633511775484314e-15
  -3.097702542750127e-13]
 [1.0 0.0 0.0 ... -1.423818161021498 -0.40225621128206973
  -0.7364368059238653]]
