In [53]:
#Downloading Relevant Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

In [54]:
#RMSE Error
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [55]:
#loading the data
data=pd.read_csv("housing.csv")

In [57]:
#taking the relevant columns only
data=data[['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value','ocean_proximity']]

In [58]:
data.isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [59]:
data=data.fillna(0)
data.isna().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [60]:
#Adding New Columns
data['rooms_per_household']=data['total_rooms']/data['households']
data['bedrooms_per_room']=data['total_bedrooms']/data['total_rooms']
data['population_per_household']=data['population']/data['households']

In [61]:
#Calculating the mode
data['ocean_proximity'].mode()

0    <1H OCEAN
dtype: object

In [62]:
#Splitting the data
Y=data["median_house_value"]
X=data.drop(['median_house_value'],axis=1)

In [63]:
#Splitting the data into testing, validation and training
X_full_train, X_test,Y_full_train,Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)
X_train,X_val,Y_train,Y_val = train_test_split(X_full_train,Y_full_train, test_size=0.25, random_state=42)

In [64]:
#Calculate the correlation matrix
#Drop the catorigcal values then correlation
X_train_numerical=X_train.drop(['ocean_proximity'],axis=1)
correlation_Matrix = X_train_numerical.corr()

In [65]:
correlation_Matrix

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


In [66]:
#getting the pairs with highest correlation
s = correlation_Matrix.unstack()
so = s.sort_values(kind="quicksort")
so[-20:-11]

total_bedrooms  population        0.877340
households      population        0.906841
population      households        0.906841
households      total_rooms       0.921441
total_rooms     households        0.921441
total_bedrooms  total_rooms       0.931546
total_rooms     total_bedrooms    0.931546
households      total_bedrooms    0.979399
total_bedrooms  households        0.979399
dtype: float64

In [67]:
#calulating the categorical values
Y_cat = (Y_train> Y_train.mean()) .astype(int)


In [68]:
#Calculating the mutual information
round(mutual_info_score(Y_cat, X_train['ocean_proximity']),2)

0.1

In [69]:
#Decoding the daata (Categorical Variable)
dv = DictVectorizer(sparse=False)

X_train_decoded = X_train.to_dict(orient='records')
X_train_decoded= dv.fit_transform(X_train_decoded)
X_train_decoded


array([[2.59713701e-01, 3.74000000e+02, 3.90000000e+01, ...,
        3.92245989e+00, 3.81000000e+02, 1.46700000e+03],
       [1.30227981e-01, 8.06000000e+02, 2.40000000e+01, ...,
        7.56451613e+00, 7.94000000e+02, 6.09700000e+03],
       [2.34624146e-01, 3.37000000e+02, 4.10000000e+01, ...,
        3.90801187e+00, 3.09000000e+02, 1.31700000e+03],
       ...,
       [1.82879377e-01, 6.02000000e+02, 1.80000000e+01, ...,
        5.54983389e+00, 6.11000000e+02, 3.34100000e+03],
       [2.29126214e-01, 3.50000000e+02, 1.60000000e+01, ...,
        4.41428571e+00, 3.54000000e+02, 1.54500000e+03],
       [2.09574468e-01, 2.15000000e+02, 3.50000000e+01, ...,
        4.37209302e+00, 1.97000000e+02, 9.40000000e+02]])

In [70]:
#fitting the logistic regression model
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_decoded,Y_cat)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [71]:
#testing the model on the validation
#Prepare the validation dataset
Y_cat_val = (Y_val> Y_val.mean()) .astype(int)
X_val_decoded = X_val.to_dict(orient='records')
X_val_decoded= dv.transform(X_val_decoded)


In [72]:
#Calculating the accuracy
round(model.score(X_val_decoded, Y_cat_val),2)

0.84

In [73]:
#Question 5
Columns=X_train.columns
error=[]
deleted_col=[]
# exclude each feature from this set and train a model without it
for i in range(len(Columns)):
  deleted_col.append(Columns[i])
  Selected_Col=[]
  for j in range(len(Columns)):
    if j!=i:
      Selected_Col.append(Columns[j])
  X_full_train, X_test,Y_full_train,Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)
  X_train,X_val,Y_train,Y_val = train_test_split(X_full_train,Y_full_train, test_size=0.25, random_state=42)
  dicts_train_small = X_train[Selected_Col].to_dict(orient='records')
  dicts_val_small = X_val[Selected_Col].to_dict(orient='records')
  dv_small = DictVectorizer(sparse=False)
  dv_small.fit(dicts_train_small)
  X_train_small = dv_small.transform(dicts_train_small)
  X_val_small=dv_small.transform(dicts_val_small)
  model_small = LogisticRegression(solver='lbfgs')
  #calculate the difference between the original accuracy and the accuracy without the feature
  model_small.fit(X_train_small, Y_cat)
  model_small.score(X_val_small, Y_cat_val)
  error.append(abs(model_small.score(X_val_small, Y_cat_val)-0.84))

      
  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [74]:
deleted_col

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [26]:
error

[-0.017567829457364348,
 -0.017325581395348788,
 -0.04324612403100769,
 -0.011996124031007693,
 -0.012480620155038702,
 -0.032102713178294495,
 -0.00957364341085265,
 -0.08491279069767443,
 -0.03985465116279063,
 -0.031133720930232478,
 -0.03670542635658913,
 -0.036220930232558124]

In [76]:
# feature has the smallest difference
minimum_index=error.index(min(error))
deleted_col[minimum_index]

'households'

In [77]:

#Question6 Ridge Regression
#Calculate the np log
Y_train=np.log1p(Y_train)
Y_val=np.log1p(Y_val)

In [78]:
alpha=np.array([0, 0.01, 0.1, 1, 10])
Error=np.zeros((5))
for i in range(len(alpha)):
  model = Ridge(alpha=alpha[i], solver="sag", random_state=42)
  model.fit(X_train_decoded,Y_train)
  Y_pred=model.predict(X_val_decoded)
  Error[i]=round(rmse(Y_pred,Y_val),3)


In [79]:
Error

array([0.524, 0.524, 0.524, 0.524, 0.524])