In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, \
precision_score

In [2]:
# Print the current working directory via an IPython shell escape.
! pwd

/Users/nengkuantu/CYCU/CYCU2022/ML/20221104/SolutionForLastHW


# 1. Read Data and Make necessary changes

In [3]:
# Location of the training CSV. Set the NEW_TRAIN_CSV environment variable to
# point at your own copy; the fallback is the original author's local path, so
# the cell behaves exactly as before when the variable is not set.
DATA_PATH = os.environ.get('NEW_TRAIN_CSV', '/Users/nengkuantu/Downloads/new_train.csv')
mydata = pd.read_csv(DATA_PATH)


In [4]:
# Encode the target as 0/1 ("no" -> 0, anything else -> 1) and rename it to
# 'label', keeping the column in its original position.
# The guard makes the cell safe to re-run: once 'y' has been renamed, a second
# execution is a no-op instead of raising KeyError: 'y'.
if 'y' in mydata.columns:
    mydata = (
        mydata
        .assign(y=mydata['y'].ne('no').astype('int64'))
        .rename(columns={'y': 'label'})
    )

# 2. Understanding the dataset

In [5]:
# Preview the first rows to check the load and the 0/1 'label' column.
mydata.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,label
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,999,0,nonexistent,0
1,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,999,1,failure,0
2,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,999,0,nonexistent,1
3,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,999,0,nonexistent,0
4,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,999,0,nonexistent,0


In [6]:
# Column dtypes: the 'object' columns are the non-numerical features.
mydata.dtypes

age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
label           int64
dtype: object

In [7]:
# Distinct values appearing in the 'job' column.
set(mydata['job'])

{'admin.',
 'blue-collar',
 'entrepreneur',
 'housemaid',
 'management',
 'retired',
 'self-employed',
 'services',
 'student',
 'technician',
 'unemployed',
 'unknown'}

In [8]:
# Distinct values appearing in the 'poutcome' column.
set(mydata['poutcome'])

{'failure', 'nonexistent', 'success'}

# 3. correlation between numerical columns and the label ('y').

In [9]:
# Pearson correlations between the numeric columns only. Selecting the numeric
# columns explicitly keeps this working on pandas >= 2.0, where DataFrame.corr()
# no longer drops string columns silently and raises on them instead.
mydata.select_dtypes(include='number').corr()

Unnamed: 0,age,duration,campaign,pdays,previous,label
age,1.0,-0.001841,0.003302,-0.032011,0.02067,0.028673
duration,-0.001841,1.0,-0.075663,-0.047127,0.022538,0.400752
campaign,0.003302,-0.075663,1.0,0.053795,-0.079051,-0.065462
pdays,-0.032011,-0.047127,0.053795,1.0,-0.589601,-0.325539
previous,0.02067,0.022538,-0.079051,-0.589601,1.0,0.229759
label,0.028673,0.400752,-0.065462,-0.325539,0.229759,1.0


### Among the numerical features, duration is the most strongly correlated with the label (r ≈ 0.40), followed by pdays (r ≈ -0.33).

# 4. correlation between non-numerical columns and the label ('y').
-------
- We cannot compute correlation coefficients between the label and non-numerical (categorical) columns.
- Instead, we look for features whose categories differ widely in their probability of a positive label.
    - For instance, suppose we ask whether a person's education is related to their job: a professor is very likely to hold an advanced degree, while a taxi driver is much less likely to. Therefore, the variation (standard deviation) of the positive-label probability across the categories of a column is a good indicator of how strongly that column is associated with the label.

## 4.1 Example: positive-label probabilities within one column

In [10]:
# Share of positive labels per job. The label is 0/1, so the group mean equals
# sum / count and needs a single groupby pass instead of two.
tmp = mydata.groupby('job').label.mean()
tmp

job
admin.           0.128699
blue-collar      0.069211
entrepreneur     0.086207
housemaid        0.100585
management       0.114712
retired          0.254758
self-employed    0.108280
services         0.079474
student          0.305204
technician       0.108333
unemployed       0.145363
unknown          0.124528
Name: label, dtype: float64

In [11]:
# Move the job names out of the index so the ratios form a (job, label) frame.
tmpDF = tmp.reset_index()
tmpDF

Unnamed: 0,job,label
0,admin.,0.128699
1,blue-collar,0.069211
2,entrepreneur,0.086207
3,housemaid,0.100585
4,management,0.114712
5,retired,0.254758
6,self-employed,0.10828
7,services,0.079474
8,student,0.305204
9,technician,0.108333


In [12]:
# Summary statistics of the per-job positive rates; the 'std' row is the figure used below.
tmpDF.describe()

Unnamed: 0,label
count,12.0
mean,0.135446
std,0.071592
min,0.069211
25%,0.09699
50%,0.111523
75%,0.132865
max,0.305204


In [13]:
# Standard deviation of the per-job positive rates, read from the summary table
# by row/column label. describe() is evaluated once instead of twice, and no
# boolean mask or positional lookup is needed.
tmpDF.describe().loc['std', 'label']

0.07159160313363819

In [14]:
# describe() already returns a DataFrame, so it is stored as-is.
tmpDesc = tmpDF.describe()
tmpDesc

Unnamed: 0,label
count,12.0
mean,0.135446
std,0.071592
min,0.069211
25%,0.09699
50%,0.111523
75%,0.132865
max,0.305204


In [15]:
# Keep only the 'std' row, still as a one-row DataFrame.
tmpDesc.loc[['std']]

Unnamed: 0,label
std,0.071592


In [16]:
# Pull the scalar out of the 'std' row (first, and only, summary column).
tmpDesc.loc['std'].iloc[0]

0.07159160313363819

## 4.2 The standard deviation of positive probabilities for all columns

### 4.2.1 list all positive probabilities for all columns

In [17]:
# Re-inspect the dtypes: the object-dtype columns are the ones analysed in this section.
mydata.dtypes

age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
label           int64
dtype: object

In [18]:
# dtypes is a pandas Series indexed by column name.
type(mydata.dtypes)

pandas.core.series.Series

In [19]:
# Two-column frame: 'index' holds the column names, column 0 holds their dtypes.
tmp = mydata.dtypes.reset_index()
tmp

Unnamed: 0,index,0
0,age,int64
1,job,object
2,marital,object
3,education,object
4,default,object
5,housing,object
6,loan,object
7,contact,object
8,month,object
9,day_of_week,object


In [20]:
# Name of the first column (row 0, first field of the dtype frame).
tmp.iat[0, 0]

'age'

In [21]:
# dtype of the first column (row 0, second field of the dtype frame).
tmp.iat[0, 1]

dtype('int64')

In [22]:
# For every non-numerical (object-dtype) column, print the share of positive
# labels per category together with the number of rows in each category.
# Iterating over the frame's own columns removes the dependency on the scratch
# variable `tmp`, which other cells rebind to unrelated objects, so this cell
# no longer breaks when cells are re-run out of order.
for current in mydata.select_dtypes(include='object').columns:
    grouped = mydata.groupby(current).label  # one groupby reused for both statistics
    ratio = grouped.mean()                   # label is 0/1, so mean == sum / count
    print('==========', type(ratio))
    print(mydata[current].dtype, ratio, "++++", grouped.count())

object job
admin.           0.128699
blue-collar      0.069211
entrepreneur     0.086207
housemaid        0.100585
management       0.114712
retired          0.254758
self-employed    0.108280
services         0.079474
student          0.305204
technician       0.108333
unemployed       0.145363
unknown          0.124528
Name: label, dtype: float64 ++++ job
admin.           8314
blue-collar      7441
entrepreneur     1160
housemaid         855
management       2345
retired          1366
self-employed    1099
services         3196
student           711
technician       5400
unemployed        798
unknown           265
Name: label, dtype: int64
object marital
divorced    0.100952
married     0.101238
single      0.141515
unknown     0.169231
Name: label, dtype: float64 ++++ marital
divorced     3675
married     19953
single       9257
unknown        65
Name: label, dtype: int64
object education
basic.4y               0.103552
basic.6y               0.083646
basic.9y               0.076461

### 4.2.2 collect the standard deviations for all non-numerical columns

In [23]:
# For every non-numerical (object-dtype) column, compute the standard deviation
# of the per-category positive-label rate and collect [column name, std] pairs.
# The columns come straight from `mydata`, so the cell does not rely on the
# scratch variable `tmp` holding the dtype frame.
# NOTE: the std is unweighted -- a category with few rows counts as much as a
# category with thousands of rows.
stdlist = []
for current in mydata.select_dtypes(include='object').columns:
    ratio = mydata.groupby(current).label.mean()  # label is 0/1, so mean == sum / count
    # Series.std() is the sample standard deviation (ddof=1), the same figure
    # that describe() reports in its 'std' row.
    stdlist.append([current, ratio.std()])

# The layout is kept as before: 'index' (row number), 0 (column name), 1 (std).
stdlistDF = pd.DataFrame(stdlist).reset_index()
stdlistDF.sort_values(1)

Unnamed: 0,index,0,1
5,5,loan,0.000392
4,4,housing,0.003175
8,8,day_of_week,0.006905
1,1,marital,0.033318
2,2,education,0.036605
3,3,default,0.064822
6,6,contact,0.066499
0,0,job,0.071592
7,7,month,0.189328
9,9,poutcome,0.308903


### poutcome has the highest standard deviation, so it is the non-numerical feature most strongly associated with the label.