# DataDrive2030 Early Learning Predictors Challenge Solution


## Import important libraries

In [None]:
!pip install eli5 -q

In [None]:
import pandas as pd
import numpy as np
import eli5
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
warnings.filterwarnings('ignore')

## Reading files


In [None]:
# Load the training data, the test data and the sample submission template.
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
ss = pd.read_csv('SampleSubmission.csv')
# Keep the test identifiers separately: 'child_id' is dropped from `test`
# further down, but the submission file is built with it.
ID=test['child_id']

## EDA

In [None]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [None]:
# Dimensions of the test frame as (rows, columns).
test.shape

In [None]:
# Per-column dtype and non-null summary of the training frame.
train.info()

In [None]:
# Per-column dtype and non-null summary of the test frame.
test.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Lift pandas' column display limit so the preview below shows every column
# instead of eliding the middle ones.
pd.set_option('display.max_columns',None)
train.head()

## Some Feature Engineering Techniques

In [None]:
# Remove the identifier column from both frames, in place.
for frame in (train, test):
    frame.drop(columns="child_id", inplace=True)

In [None]:
# Keep only the numeric columns of each frame.
train = train.select_dtypes(include='number')
test = test.select_dtypes(include='number')

In [None]:
def Agg(Feature, datasets=None, keys=('child_age', 'id_facility_n'), source=None):
    """Add per-group summary statistics of ``Feature`` as new columns.

    For every grouping column in ``keys`` the statistics of ``Feature`` are
    computed once on ``source`` and then mapped onto each frame in
    ``datasets`` through that frame's own grouping column.  The new columns
    are named ``{Feature}_{key}_{Statistic}`` and are added in place.  Group
    values that do not occur in ``source`` map to NaN.

    Parameters
    ----------
    Feature : str
        Column of ``source`` to aggregate.
    datasets : iterable of DataFrame, optional
        Frames that receive the new columns.  Defaults to the module-level
        ``(train, test)``.
    keys : iterable of str, optional
        Grouping columns; each must exist in ``source`` and in every frame
        of ``datasets``.
    source : DataFrame, optional
        Frame the statistics are computed from.  Defaults to the
        module-level ``train``.

    Returns
    -------
    None
        The frames in ``datasets`` are modified in place.

    Notes
    -----
    * Statistics are evaluated once per key and reused for every frame,
      instead of re-running the same group-by for each frame and statistic.
    * The former ``_cumsum`` column is no longer produced.
      ``groupby(...).cumsum()`` is a row-level transform indexed by row
      label, not by group key, so mapping group values through it looked
      the group value up as a row label and yielded unrelated numbers.
    * NOTE(review): when ``Feature`` is the prediction target, each training
      row's statistics include its own target value, and they are computed
      before the hold-out split, so the validation score is optimistic.
    """
    if source is None:
        source = train
    if datasets is None:
        datasets = (train, test)
    # Materialise so a one-shot iterable can be walked once per key.
    datasets = tuple(datasets)

    for key in keys:
        grouped = source.groupby(key)[Feature]
        # Evaluate everything before any frame is modified; `source` is
        # usually one of the frames being written to.
        stats = {
            'Mean': grouped.mean(),
            'Median': grouped.median(),
            'Quantile10': grouped.quantile(0.10),
            'Quantile25': grouped.quantile(0.25),
            'Quantile75': grouped.quantile(0.75),
            'Quantile90': grouped.quantile(0.90),
            'first': grouped.first(),
            'last': grouped.last(),
            'var': grouped.var(),
            'Std': grouped.std(),
            'Min': grouped.min(),
            'Max': grouped.max(),
            'Sum': grouped.sum(),
            'Skew': grouped.skew(),
            'sem': grouped.sem(),
        }
        lookups = {suffix: values.to_dict() for suffix, values in stats.items()}

        for dataset in datasets:
            for suffix, lookup in lookups.items():
                dataset[f'{Feature}_{key}_{suffix}'] = dataset[key].map(lookup)
            
# Add the per-group statistics of the 'target' column to train and test.
Agg('target')


In [None]:
# Outlier scan on train: collect the index labels of rows lying more than
# 50 standard deviations away from the mean of any column.
df_train = train
outlier_list = []

for col in df_train.columns:
    col_mean = df_train[col].mean()
    col_std = df_train[col].std()
    above = df_train[col] > col_mean + col_std * 50
    below = df_train[col] < col_mean - col_std * 50
    temp_df = df_train[above | below]
    if len(temp_df) == 0:
        continue
    outliers = temp_df.index.to_list()
    outlier_list.extend(outliers)
    print(col, len(temp_df))

# De-duplicate rows flagged through several columns and report the count.
outlier_list = list(set(outlier_list))
print(len(outlier_list))

In [None]:
# Outlier scan on test: collect the index labels of rows lying more than
# 50 standard deviations away from the mean of any column.
df_train = test
outlier_list = []

for col in df_train.columns:
    col_mean = df_train[col].mean()
    col_std = df_train[col].std()
    above = df_train[col] > col_mean + col_std * 50
    below = df_train[col] < col_mean - col_std * 50
    temp_df = df_train[above | below]
    if len(temp_df) == 0:
        continue
    outliers = temp_df.index.to_list()
    outlier_list.extend(outliers)
    print(col, len(temp_df))

# De-duplicate rows flagged through several columns and report the count.
outlier_list = list(set(outlier_list))
print(len(outlier_list))

In [None]:
# Shrink test's memory footprint: float64 -> float32 and int64 -> int16.
df = test
int16_limits = np.iinfo('int16')
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].astype('float32')
    # Narrow integers only when every value fits: astype('int16') wraps
    # out-of-range values silently instead of raising.
    elif df[col].dtype == 'int64' and df[col].between(int16_limits.min, int16_limits.max).all():
        df[col] = df[col].astype('int16')


In [None]:
# Shrink train's memory footprint: float64 -> float32 and int64 -> int16.
df = train
int16_limits = np.iinfo('int16')
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].astype('float32')
    # Narrow integers only when every value fits: astype('int16') wraps
    # out-of-range values silently instead of raising.
    elif df[col].dtype == 'int64' and df[col].between(int16_limits.min, int16_limits.max).all():
        df[col] = df[col].astype('int16')

In [None]:
# Missing-value report for train: absolute count and percentage per column,
# both sorted from most to least missing.
null_mask = train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent1 = null_mask.sum() / null_mask.count() * 100
percent2 = percent1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent2], axis=1, keys=['total', '%'])
missing_data

In [None]:
# Missing-value report for test: absolute count and percentage per column,
# both sorted from most to least missing.
null_mask = test.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent1 = null_mask.sum() / null_mask.count() * 100
percent2 = percent1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent2], axis=1, keys=['total', '%'])
missing_data

In [None]:
# Impute missing values with medians learned from the training data, so that
# train and test go through the same transformation.  Previously test was
# filled with its own medians.
train_medians = train.median()
test_medians = test.median()
train = train.fillna(train_medians)
# Columns that train does not have fall back to test's own median, as before.
test = test.fillna(train_medians).fillna(test_medians)

## Evaluating model

In [None]:
# Features are the numeric columns present in both frames, taken in train's
# column order.  Building the list by iterating a set of names gave an order
# that could change from one interpreter run to the next.
test_numeric = set(test.select_dtypes('number').columns)
num_cols = [name for name in train.select_dtypes('number').columns if name in test_numeric]
y = train.target
X = train[num_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Seed the stochastic fit so the score is repeatable, like the split above.
model = SGDRegressor(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE on the hold-out split.  Computed as sqrt(MSE) because the `squared`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print(np.sqrt(mean_squared_error(y_test, y_pred)))

## Prepare model and some previews 

In [None]:
# Display the fitted model's weights, labelled with the feature names.
eli5.show_weights(model, feature_names=num_cols)

In [None]:
# Explain the prediction for the hold-out row at position 1, limited to the
# top 5 entries and with the feature values shown.
eli5.show_prediction(model, X_test.iloc[1], feature_names=num_cols, show_feature_values=True, top = 5)

In [None]:
# Same hold-out row, explained in tabular form with the top 15 entries.
eli5.explain_prediction_df(estimator=model, doc=X_test.iloc[1], top = 15)

In [None]:
# Predict the target for every test row, using the same feature columns the
# model was fitted on.
preds = model.predict(test[num_cols])
preds

In [None]:
# Collect the top 15 explanation entries for each test prediction.
# The feature selection is done once up front; selecting inside the loop
# rebuilt the whole feature frame for every row.
test_features = test[num_cols]
top_predictors = []
for i in range(len(test_features)):
    predictors = eli5.explain_prediction_df(estimator=model, doc=test_features.iloc[i], top=15)
    top_predictors.append(predictors.feature.tolist())

## Submission

In [None]:
# Assemble the submission frame: one column per ranked feature, plus the
# prediction and the child identifier, ordered like the sample submission.
feature_columns = [f'feature_{rank}' for rank in range(1, 16)]
predictors = pd.DataFrame(top_predictors, columns=feature_columns)
predictors = predictors.assign(target=preds, child_id=ID)
predictors = predictors[list(ss.columns)]
predictors.head()

In [None]:
# Write the submission to 'sgd.csv' without the DataFrame index.
predictors.to_csv('sgd.csv', index = False)