In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [23]:
df = pd.read_csv('train.csv', usecols=['Age','Pclass','SibSp','Parch','Survived'])

In [7]:
# Alternative way to use certain features
# df = pd.read_csv('train.csv')[['Age','Pclass','SibSp','Parch','Survived']]

In [25]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [27]:
# Checking for any missing data
df.isnull().mean() * 100

Survived     0.00000
Pclass       0.00000
Age         19.86532
SibSp        0.00000
Parch        0.00000
dtype: float64

In [29]:
# Since we have missing data in Age column, so we need to remove it
df.dropna(inplace=True)

In [31]:
df.isnull().mean() * 100

Survived    0.0
Pclass      0.0
Age         0.0
SibSp       0.0
Parch       0.0
dtype: float64

In [33]:
X = df.drop(columns=['Survived'])
Y = df['Survived']

In [37]:
np.mean(cross_val_score(LogisticRegression(), X, Y, scoring='accuracy', cv=20))

0.6933333333333332

# Applying Feature Construction

In [44]:
X['Family_size'] = X['SibSp'] + X['Parch'] + 1

In [46]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,22.0,1,0,2
1,1,38.0,1,0,2
2,3,26.0,0,0,1
3,1,35.0,1,0,2
4,3,35.0,0,0,1


In [48]:
def myfunc(num):
    if num == 1:
        # alone
        return 0
    elif num > 1 and num <=4:
        # small family
        return 1
    else:
        # large family
        return 2

In [52]:
myfunc(5)

2

In [54]:
X['Family_type'] = X['Family_size'].apply(myfunc)

In [56]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,22.0,1,0,2,1
1,1,38.0,1,0,2,1
2,3,26.0,0,0,1,0
3,1,35.0,1,0,2,1
4,3,35.0,0,0,1,0


In [60]:
X.drop(columns=['SibSp','Parch','Family_size'], inplace=True)

In [62]:
X.head()

Unnamed: 0,Pclass,Age,Family_type
0,3,22.0,1
1,1,38.0,1
2,3,26.0,0
3,1,35.0,1
4,3,35.0,0


In [66]:
np.mean(cross_val_score(LogisticRegression(), X,Y, scoring='accuracy',cv=20))

0.7003174603174602

# Practice 1

In [69]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


In [96]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [98]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
iris = fetch_ucirepo(id=53) 

In [110]:
X = iris.data.features 
Y = iris.data.targets 

In [112]:
X.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [114]:
X.isnull().mean()*100

sepal length    0.0
sepal width     0.0
petal length    0.0
petal width     0.0
dtype: float64

In [126]:
np.mean(cross_val_score(LogisticRegression(), X, Y, scoring='accuracy', cv=15))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/mod

0.9733333333333333

In [128]:
X['Sepal_area'] = X['sepal length'] + X['sepal width']
X['petal_area'] = X['petal length'] + X['petal width']

In [132]:
X.drop(columns=['sepal length','sepal width','petal length','petal width'], inplace=True)

In [134]:
X.head()

Unnamed: 0,Sepal_area,petal_area
0,8.6,1.6
1,7.9,1.6
2,7.9,1.5
3,7.7,1.7
4,8.6,1.6


In [142]:
np.mean(cross_val_score(LogisticRegression(), X, Y, scoring='accuracy', cv=5))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9733333333333334