<a href="https://www.kaggle.com/code/atilaysamiloglu/data-science-interview?scriptVersionId=97544202" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time
import lightgbm as lgb
from xgboost import XGBRegressor

# **DATA**

In [None]:
# Load the interview-task CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/interview-data-science/Data_Scientist_Interview_Task.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Percentage of missing values per column (null count / number of rows * 100)

df.isnull().sum() /len(df)*100

In [None]:
msno.matrix(df)
plt.show()

In [None]:
df.describe().T

# **PREPARING DATA**


### Stripping whitespace from column names


In [None]:
df.columns[20]

In [None]:
# Strip leading/trailing whitespace from every column label.
df.columns = df.columns.str.strip()

In [None]:
df.columns[20]

### Dropping columns

In [None]:
# Drop these three columns from the frame in place.
df.drop(columns=['Claim Number', 'Unnamed: 46', 'Capped Incurred'], inplace=True)

In [None]:
df['Incurred'].value_counts()

### **Stripping whitespace from the dependent variable**


In [None]:
df['Incurred'][1]

In [None]:
# Strip surrounding whitespace with the vectorized string accessor. Unlike
# apply(lambda x: x.strip()), this leaves missing values as NaN instead of
# raising AttributeError on a non-string cell.
df['Incurred'] = df['Incurred'].str.strip()

### **Removing the currency symbol (£) from the dependent variable**


In [None]:
df['Incurred'][1]

In [None]:
# Remove the pound sign with a literal (non-regex) replace.
# The previous split('£')[1] raised IndexError for any value without a '£'
# and discarded everything before the symbol (e.g. a leading minus sign).
# astype(str) mirrors the original str(x) so every cell is a string afterwards.
df['Incurred'] = df['Incurred'].astype(str).str.replace('£', '', regex=False)

In [None]:
df['Incurred'][1]

### **Converting dependent variable to float**

In [None]:
df['Incurred'] 

In [None]:
# Drop thousands separators so the text can be parsed as a number.
# The vectorized accessor skips missing values instead of failing on them.
df['Incurred'] = df['Incurred'].str.replace(',', '', regex=False)

In [None]:
df['Incurred']  = pd.to_numeric(df['Incurred'], errors='coerce')

In [None]:
df['Incurred'] 

In [None]:
df['Incurred'].isnull().sum()

### **Deleting outlier values**

In [None]:
df.describe().T

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75, whisker=1.5):
    """Return the ``(low_limit, up_limit)`` outlier fences for one column.

    The fences are ``Q_low - whisker * IQR`` and ``Q_high + whisker * IQR``,
    where ``IQR`` is the distance between the two quantiles.

    Args:
        dataframe: DataFrame holding the column.
        col_name: Name of the numeric column to analyse.
        q1: Lower quantile (default 0.25, the first quartile).
        q3: Upper quantile (default 0.75, the third quartile).
        whisker: Multiplier applied to the inter-quantile range (default 1.5).

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    lower_quantile = dataframe[col_name].quantile(q1)
    upper_quantile = dataframe[col_name].quantile(q3)
    interquantile_range = upper_quantile - lower_quantile
    low_limit = lower_quantile - whisker * interquantile_range
    up_limit = upper_quantile + whisker * interquantile_range
    return low_limit, up_limit

# NOTE: the return value of this bare call is discarded (it is not the last
# expression of the cell), so it has no visible effect.
outlier_thresholds(df, "Incurred")

# Lower/upper fences for "Incurred"; remove_outlier() recomputes them itself.
low, up = outlier_thresholds(df, "Incurred")

def remove_outlier(dataframe, col_name):
    """Return the rows of ``dataframe`` whose ``col_name`` is not an outlier.

    A row is dropped when its value is strictly below the lower fence or
    strictly above the upper fence returned by ``outlier_thresholds``. Rows
    with a missing value compare False on both sides and are therefore kept.
    """
    lower_fence, upper_fence = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    is_outlier = values.lt(lower_fence) | values.gt(upper_fence)
    return dataframe[~is_outlier]


df = remove_outlier(df, "Incurred")


In [None]:

df.describe().T

In [None]:
df.dropna(subset=['Incurred'], inplace=True)
#df["Incurred"] = df["Incurred"].fillna(df["Incurred"].mean())


### **Date format for date_of_loss**

In [None]:
# Parse date_of_loss into datetimes so the .dt accessor can be used later.
# NOTE(review): no format/dayfirst is given, so pandas infers the layout;
# confirm the source dates are not day-first (e.g. dd/mm/yyyy).
df['date_of_loss'] = pd.to_datetime(df['date_of_loss'])

In [None]:
df.info()

### **Categorical variables**

In [None]:
subset = df.columns.difference(['date_of_loss','Notification_period', 'Inception_to_loss','Time_hour', 'Incurred'])
subset

In [None]:
# summary of categorical values and charts

def cat_summary(dataframe, col_name):
    """Print a frequency table for one column and show its count plot.

    Args:
        dataframe: DataFrame holding the column.
        col_name: Name of the column to summarise.

    Prints the value counts together with their share of all rows (in
    percent), then draws a seaborn count plot with rotated tick labels.
    Returns None.
    """
    # Count once and reuse for both the absolute and the percentage column.
    counts = dataframe[col_name].value_counts()

    print(' ')
    print('###############################################################')
    print(' ')
    print('###############################################################')
    print(pd.DataFrame({col_name: counts,
                        "Ratio": 100 * counts / len(dataframe)}))
    print(' ')
    print(' ')
    ax = sns.countplot(x=dataframe[col_name], data=dataframe)
    # Rotate the existing tick labels in place instead of re-setting them via
    # set_xticklabels(), which can warn when the tick locations are not fixed.
    plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
    plt.tight_layout()
    plt.show()
    

for i in subset:
    cat_summary(df, i)

### **Dropping columns**

In [None]:
# Drop the two loss columns. axis=1 selects columns; the previous
# `axis = True` relied on the boolean True being equal to 1.
df.drop(['Loss_code', 'Loss_description'], inplace=True, axis=1)

In [None]:
subset = df.columns.difference(['date_of_loss','Notification_period', 'Inception_to_loss','Time_hour', 'Incurred'])
subset

### **Rare-label encoding of infrequent categories**

In [None]:
def rare_encoder(dataframe, rare_perc, columns=None):
    """Return a copy of ``dataframe`` with infrequent categories relabelled.

    For every object-dtype column under consideration, each label whose
    relative frequency (count / number of rows) is below ``rare_perc`` is
    replaced by the string ``'Rare'``. The input frame is not modified.

    Args:
        dataframe: Source DataFrame.
        rare_perc: Frequency threshold as a fraction (e.g. 0.03 for 3 %).
        columns: Optional iterable of column names to consider. Defaults to
            all columns of ``dataframe``, so the function no longer depends
            on a notebook-level variable. Non-object columns are skipped.

    Returns:
        A new DataFrame with rare labels collapsed into ``'Rare'``.
    """
    temp_df = dataframe.copy()
    candidates = temp_df.columns if columns is None else columns
    n_rows = len(temp_df)

    for var in candidates:
        # Only text/categorical (object) columns are encoded.
        if temp_df[var].dtypes != 'O':
            continue
        frequencies = temp_df[var].value_counts() / n_rows
        rare_labels = frequencies[frequencies < rare_perc].index
        if len(rare_labels) == 0:
            continue
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), 'Rare', temp_df[var])

    return temp_df


new_df = rare_encoder(df, 0.03)

In [None]:
new_df[new_df["TP_region_north"].str.contains("Rare")].head()


In [None]:
for i in subset:
  print(new_df[i].value_counts()/len(new_df),  end="\n\n\n")

In [None]:
df.info()


# **FEATURE ENGINEERING**


In [None]:
def create_date_features(new_df):
    """Add calendar features derived from the ``date_of_loss`` column.

    The frame is modified in place and also returned. ``date_of_loss`` must
    be a datetime column.

    Added columns: ``month``, ``day_of_month``, ``week_of_year`` (ISO week),
    ``day_of_week`` (1 = Monday ... 7 = Sunday), ``year``, ``is_wknd``,
    ``is_month_start`` and ``is_month_end``.

    Args:
        new_df: DataFrame with a datetime ``date_of_loss`` column.

    Returns:
        The same DataFrame, with the feature columns added.
    """
    dates = new_df.date_of_loss

    new_df['month'] = dates.dt.month
    new_df['day_of_month'] = dates.dt.day

    # Series.dt.weekofyear was deprecated and later removed from pandas;
    # isocalendar().week is its replacement. It returns a nullable UInt32
    # column, so convert back to a plain NumPy dtype (float when dates are
    # missing, since NaN cannot be stored in an int column).
    iso_week = dates.dt.isocalendar().week
    new_df['week_of_year'] = iso_week.astype(
        'float64' if iso_week.isna().any() else 'int64'
    )

    new_df['day_of_week'] = dates.dt.dayofweek + 1
    new_df['year'] = dates.dt.year
    # weekday is 0 (Mon) .. 6 (Sun); integer division by 4 yields 1 for
    # Friday, Saturday and Sunday.
    # NOTE(review): Friday is flagged as weekend here -- confirm intended.
    new_df["is_wknd"] = dates.dt.weekday // 4
    new_df['is_month_start'] = dates.dt.is_month_start.astype(int)
    new_df['is_month_end'] = dates.dt.is_month_end.astype(int)
    return new_df

new_df = create_date_features(new_df)
new_df.drop(['date_of_loss'], inplace = True, axis = 1)

In [None]:
new_df['Notification_period'].value_counts(), new_df['Notification_period'].max()

In [None]:
# Bucket Notification_period into labelled, non-overlapping ranges.
# The previous chained .loc assignments used overlapping ranges (0-300,
# 200-500, ...) where each later assignment overrode the earlier one, and
# wrote strings into a column initialised with the integer 0. The edges
# below are the ranges that chain actually produced, as half-open
# [left, right) intervals.
notif_edges = [0, 200, 400, 600, 800, 1100]
notif_labels = ['Notifi_early', 'Notifi_late_early', 'Notifi_moderate',
                'Notifi_late', 'Notifi_too_late']
notif_bins = pd.cut(new_df['Notification_period'], bins=notif_edges,
                    labels=notif_labels, right=False)
# Values outside [0, 1100) and missing values keep the original fallback 0.
new_df['new_Notification_period'] = notif_bins.astype(object).where(notif_bins.notna(), 0)

new_df.drop(['Notification_period'], inplace = True, axis = 1)
new_df['new_Notification_period'].value_counts()

In [None]:
new_df['Inception_to_loss'].value_counts(), new_df['Inception_to_loss'].max()

In [None]:
# Bucket Inception_to_loss into twelve "month" bands: 30-day half-open
# intervals [0, 30), [30, 60), ..., [300, 330), with the last band covering
# [330, 370). Built with pd.cut instead of twelve chained .loc assignments
# that wrote strings into a column initialised with the integer 0.
inception_edges = list(range(0, 331, 30)) + [370]
inception_labels = [f'{month}m_Inception_to_loss' for month in range(1, 13)]
inception_bins = pd.cut(new_df['Inception_to_loss'], bins=inception_edges,
                        labels=inception_labels, right=False)
# Values outside [0, 370) and missing values keep the original fallback 0.
new_df['new_Inception_to_loss'] = inception_bins.astype(object).where(inception_bins.notna(), 0)

new_df.drop(['Inception_to_loss'], inplace = True, axis = 1)
new_df['new_Inception_to_loss'].value_counts()

In [None]:
new_df['Time_hour'].value_counts(), new_df['Time_hour'].max()

In [None]:
# Bucket Time_hour into five labelled bands using half-open [left, right)
# intervals: [0, 4), [4, 9), [9, 14), [14, 19), [19, 24). Built with pd.cut
# instead of chained .loc assignments that wrote strings into a column
# initialised with the integer 0.
hour_edges = [0, 4, 9, 14, 19, 24]
hour_labels = ['Time_hour_too_early', 'Time_hour_early', 'Time_hour_moderate',
               'Time_hour_late', 'Time_hour_too_late']
hour_bins = pd.cut(new_df['Time_hour'], bins=hour_edges,
                   labels=hour_labels, right=False)
# Values outside [0, 24) and missing values keep the original fallback 0.
new_df['new_Time_hour'] = hour_bins.astype(object).where(hour_bins.notna(), 0)

new_df.drop(['Time_hour'], inplace = True, axis = 1)
new_df['new_Time_hour'].value_counts()

# **Encoding**

In [None]:
new_df.info()

In [None]:
# Names of all object-dtype columns, in column order.
cols_ = [name for name, dtype in new_df.dtypes.items() if dtype == 'O']

In [None]:
def one_hot_encoder(dataframe, categorical_cols,drop_first=False):
    """One-hot encode ``categorical_cols`` and return the resulting frame.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is forwarded
    unchanged. The input frame is not modified.
    """
    return pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
    )
new_df = one_hot_encoder(new_df, cols_, drop_first=True)

In [None]:
new_df.info()

In [None]:
new_df.shape

# **Model**

In [None]:
import re
# Remove every character other than ASCII letters, digits and underscore
# from the column names.
# NOTE(review): distinct names can collapse to the same sanitized name;
# presumably done because the models reject special characters in feature
# names -- confirm.
new_df = new_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [None]:
# Separate the target column from the feature matrix.
target_col = "Incurred"
y = new_df[target_col]
X = new_df.drop(columns=[target_col])

y.shape, X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

## **LGBM**

In [None]:
# Baseline LightGBM with default hyper-parameters, scored on the test split.
lgb_model = LGBMRegressor()
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
baseline_mae = mean_absolute_error(y_test, y_pred)
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
baseline_mae, baseline_rmse



### **Model Tuning**


In [None]:
lgb_model = LGBMRegressor()

In [None]:
# Hyper-parameter grid: 3 learning rates x 2 tree counts x 4 depths x
# 2 column-sampling ratios = 48 combinations.
lgbm_params = {"learning_rate": [0.001,0.02, 0.2],
               "n_estimators": [500, 1000,],
               "max_depth": [3, 5, 8, 10],               
               "colsample_bytree": [1, 0.8]}

# Exhaustive search with 5-fold cross-validation (48 x 5 = 240 fits) on the
# training split, using all available cores (n_jobs=-1).
lgbm_cv_model = GridSearchCV(lgb_model,
                             lgbm_params,
                             cv=5,
                             n_jobs=-1,
                             verbose=2,).fit(X_train, y_train)

In [None]:
# Refit LightGBM on the training split with the best parameters found by
# the grid search, then report (MAE, RMSE) on the test split.
best_lgbm_params = lgbm_cv_model.best_params_
lgbm_tuned = LGBMRegressor(**best_lgbm_params)
lgbm_tuned.fit(X_train, y_train)
y_pred = lgbm_tuned.predict(X_test)
tuned_mae = mean_absolute_error(y_test, y_pred)
tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
tuned_mae, tuned_rmse

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Plot the feature importances of a fitted model as a horizontal bar chart.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose columns name the model's features, in the
            same order as ``feature_importances_``.
        num: Maximum number of top features to show. ``None`` (default)
            shows all of them. The previous default, ``len(X)``, was a
            global row count evaluated once at definition time.
        save: When True, write the chart to ``importances.png``.

    Returns:
        None.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    if num is not None:
        ranked = ranked[0:num]

    plt.figure(figsize=(15, 20))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once the figure has been shown, savefig() may
    # write an empty image.
    if save:
        plt.savefig('importances.png')
    plt.show()

  
plot_importance(lgbm_tuned, X_train)

## **Random Forest**

In [None]:
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

In [None]:
# Test error (MAE, RMSE) of the fitted random forest on the test split
y_pred = rf_model.predict(X_test)
mean_absolute_error(y_test, y_pred),np.sqrt(mean_squared_error(y_test, y_pred))

### **Model Tuning**

In [None]:
# Hyper-parameter grid: 3 depths x 3 max_features x 2 tree counts x
# 4 min_samples_split values = 72 combinations.
rf_params = {"max_depth": [5, 8, None],
             "max_features": [3, 5, 15],
             "n_estimators": [200, 500],
             "min_samples_split": [2, 5, 8, 10]}

# Exhaustive search with 5-fold cross-validation (72 x 5 = 360 fits) on the
# training split; the estimator is seeded with random_state=42.
rf_model = RandomForestRegressor(random_state=42)
rf_cv_model = GridSearchCV(rf_model, rf_params, cv=5, n_jobs=-1, verbose=1).fit(X_train, y_train)


In [None]:
# Refit the random forest with the best grid-search parameters.
# random_state=42 matches the estimator used during the search; without it
# the tuned model is seeded differently on every run and its scores are not
# reproducible. best_params_ only holds the searched keys, so the keyword
# cannot collide.
rf_tuned = RandomForestRegressor(**rf_cv_model.best_params_, random_state=42).fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
mean_absolute_error(y_test, y_pred),np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
plot_importance(rf_tuned, X_train)