In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import Binarizer ,LabelEncoder
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_error,accuracy_score,mean_squared_log_error,r2_score

# Model on Train Data


In [2]:
#loading train data
Train_df=pd.read_csv('../Data/Train.csv')

In [3]:
Train_df.head()

Unnamed: 0,ID,account_type,gender,age,region_code,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,cc_cons_jun,...,debit_count_may,max_credit_amount_may,debit_amount_jun,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,loan_enq,emi_active,cc_cons
0,12554,current,M,35,708,24893.0,378.0,10288.0,29664.0,16291.4,...,40,24563.0,30051.32,12761.0,2,65,50836.0,Y,1674.09,20014
1,17645,current,M,35,123,18941.62,966.0,20672.0,287.0,4217.0,...,78,23948.0,42119.05,76206.0,21,63,23226.0,Y,13043.34,10173
2,7604,current,M,55,802,5678.87,2724.0,1964.5,3933.11,23956.25,...,82,10752.0,58373.25,75283.0,7,1,27036.0,Y,25375.27,16095
3,1590,current,F,29,867,30489.5,1236.0,12609.88,9138.14,17521.0,...,38,12607.0,69851.51,68708.0,21,83,43037.0,Y,3544.33,7707
4,16556,current,M,34,802,7441.4,6906.04,4364.0,1939.0,2121.0,...,50,25213.0,10071.0,80140.0,8,32,32044.0,Y,12780.44,96408


In [4]:
# Dropping unneccessary columns
Train_df=Train_df.drop(['ID','personal_loan_active','vehicle_loan_active','personal_loan_closed','vehicle_loan_closed','loan_enq'],axis=1)

In [5]:
# coverting Categorical data to int

Train_df['account_type']=pd.get_dummies(Train_df['account_type'])

Train_df['gender']=pd.get_dummies(Train_df['gender'])


In [6]:
# Outliers Function to check outliers in dataframe

def check_outliers(df):
    col = list(df)
    outliers = pd.DataFrame(columns=['columns','Outliers'])
    
    for column in col:
        if column in df.select_dtypes(include=np.number).columns:
            q1 = df[column].quantile(0.25) 
            q3 = df[column].quantile(0.75)
            below = q1 - (1.5*q3 - q1)
            above = q3 + (1.5*q3 - q1)
            outliers = outliers.append({'columns':column,'Outliers':df.loc[(df[column] < below) | (df[column] > above)].shape[0]},ignore_index=True)
    return outliers

In [7]:
check_outliers(Train_df)

Unnamed: 0,columns,Outliers
0,account_type,2273
1,gender,2055
2,age,534
3,region_code,0
4,cc_cons_apr,1131
5,dc_cons_apr,1146
6,cc_cons_may,1113
7,dc_cons_may,1184
8,cc_cons_jun,1096
9,dc_cons_jun,1259


In [8]:
# Dfine independent and target varible 
X= Train_df.iloc[:,:-1]

y= Train_df.cc_cons


In [9]:
#log Transformation to normilize the value of target variable
y = np.log1p(y)

In [10]:
# seprate the num columns and cat data columns
X_cols = X.columns
num_cols = X.select_dtypes(exclude=['object','category']).columns
cat_cols = [i for i in X_cols if i not in X[num_cols].columns]
for i in cat_cols:
    X[i] = X[i].astype('category')

In [11]:
#function for removing outliers
def removing_outliers(dataframe):
    cols = list(dataframe)
    for col in cols:
        if col in dataframe.select_dtypes(include=np.number).columns:
            dataframe[col] = winsorize(dataframe[col], limits=[0.1, 0.1],inclusive=(True, True))
    
    return dataframe

In [12]:
X[num_cols] = removing_outliers(X[num_cols])

check_outliers(X)

Unnamed: 0,columns,Outliers
0,account_type,2273
1,gender,2055
2,age,0
3,region_code,0
4,cc_cons_apr,0
5,dc_cons_apr,0
6,cc_cons_may,0
7,dc_cons_may,0
8,cc_cons_jun,0
9,dc_cons_jun,0


In [13]:
scaler = StandardScaler()

X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=X_cols)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=72)


In [15]:
def rmsle(actual_column, predicted_column):
    sum=0.0
    for x,y in zip(actual_column,predicted_column):
        if x<0 or y<0: #check for negative values. 
            continue
        p = np.log(y+1)
        r = np.log(x+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted_column))**0.5

In [16]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [17]:
Train_rmse=np.sqrt(mean_squared_error(y_train, lr.predict(X_train)))
Train_Test_rmse=np.sqrt(mean_squared_error(y_test, lr.predict(X_test)))
Train_rmse,Train_Test_rmse
print('Rmse Train data',Train_rmse)
print('Rmse Test data',Train_Test_rmse)

Rmse Train data 1.6077337546779327
Rmse Test data 1.6152702838307542


In [18]:
print('RMSLE',rmsle(y_test,y_pred))

RMSLE 0.15997909173272234


# Test Data

In [20]:
Test_df=pd.read_csv('../Data/Test.csv')


In [21]:
Test_df.head()

Unnamed: 0,ID,account_type,gender,age,region_code,cc_cons_apr,dc_cons_apr,cc_cons_may,dc_cons_may,cc_cons_jun,...,credit_count_may,debit_count_may,max_credit_amount_may,debit_amount_jun,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,loan_enq,emi_active
0,17591,current,M,37,575,2795.45,1290.0,6141.05,676.5,9146.1,...,24.0,20.0,9750.0,30527.88,31271.0,3.0,12.0,25945.92,Y,1170.49
1,13541,current,M,33,394,29405.13,1640.0,1670.0,2463.92,10947.5,...,43.0,40.0,16967.0,14342.83,16582.0,6.0,39.0,12214.0,Y,16447.45
2,13431,current,M,53,324,5985.2,6189.0,2696.09,766.0,716.0,...,42.0,35.0,36398.0,32503.16,33539.54,5.0,9.0,13215.0,Y,2622.28
3,8687,current,M,33,370,2105.93,18225.0,34763.56,167.0,4260.27,...,3.0,5.0,97825.0,93572.42,109429.75,21.0,0.0,72317.0,Y,340.79
4,14727,current,M,62,505,3269.0,3532.0,3158.4,2699.77,3373.48,...,70.0,75.0,27936.0,19011.5,41401.0,6.0,35.0,42344.0,Y,2812.2


In [22]:
#check outliers of test data
check_outliers(Test_df)

Unnamed: 0,columns,Outliers
0,ID,0
1,age,172
2,region_code,0
3,cc_cons_apr,374
4,dc_cons_apr,369
5,cc_cons_may,378
6,dc_cons_may,396
7,cc_cons_jun,370
8,dc_cons_jun,468
9,cc_count_apr,283


In [23]:
Test_df[num_cols] = removing_outliers(Test_df[num_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [24]:
check_outliers(Test_df)

Unnamed: 0,columns,Outliers
0,ID,0
1,age,0
2,region_code,0
3,cc_cons_apr,0
4,dc_cons_apr,0
5,cc_cons_may,0
6,dc_cons_may,0
7,cc_cons_jun,0
8,dc_cons_jun,0
9,cc_count_apr,0


In [25]:
Test_Id = Test_df['ID']

In [26]:
Test_df = Test_df.drop(['ID'],axis=1)

In [27]:
Test_df['account_type']=pd.get_dummies(Test_df['account_type'])

Test_df['gender']=pd.get_dummies(Test_df['gender'])

In [28]:
Test_df=Test_df.drop(['personal_loan_active','personal_loan_closed','vehicle_loan_active','vehicle_loan_closed','loan_enq'],axis=1)

In [29]:
Test_df_col = Test_df.columns

In [30]:
Test_df = scaler.transform(Test_df)
Test_df = pd.DataFrame(Test_df, columns=Test_df_col)

In [31]:
Test_df['cc_cons'] = lr.predict(Test_df)

In [32]:
Test_df['cc_cons'] = np.exp(Test_df['cc_cons'])-1

In [35]:
sub_lr = pd.concat([Test_Id, Test_df['cc_cons']], axis=1) 

In [36]:
sub_lr.to_csv('sub_Lr.csv', index=False)

In [37]:
submissions_lr

Unnamed: 0,ID,cc_cons
0,17591,13532.214842
1,13541,15747.052028
2,13431,13402.686237
3,8687,15903.905753
4,14727,13668.601380
5,14988,14595.174742
6,14859,17238.501660
7,16636,15238.365072
8,7625,15006.549819
9,16492,14734.946002
