# Importing Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Loading The Data

In [2]:
pd.set_option('display.max_columns', None)
df = pd.read_csv("train.csv")
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,14,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,3,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,3,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,5,336,670,869,18,10,19,1,1,1,0


In [3]:
inputs = df.iloc[:,:-1]
target = df.iloc[:,-1]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state = 72)

# Preprocessing the Data

In [23]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        return X[ self._feature_names ] 

In [53]:
#Custom transformer to standardize the continuous numerical values
class CustomNormalizer(BaseEstimator,TransformerMixin):
    
    #Class Constructor
    def __init__(self,features,copy=True,with_mean=True,with_std=True):
        self.features = features
        self.scaler = StandardScaler(copy,with_mean,with_std)
    
    #Return the fit features
    def fit( self, X, y = None ):
        return self

    # Custom transformer 
    def transform(self, X, y = None):
        categorical_feature_names = []
        numerical_feature_names = []

        unique_entries = pd.DataFrame(X[self.features].nunique(), columns = ['values']).T
        for feature in unique_entries.columns:
            if(unique_entries.loc["values", feature] == 2):
                categorical_feature_names.append(feature)
            else:
                numerical_feature_names.append(feature)

        numerical_features_scaled = self.scaler.fit_transform(X[numerical_feature_names])
        numerical_features = pd.DataFrame(numerical_features_scaled, columns = numerical_feature_names) 
        
        categorical_features = X.drop(numerical_feature_names, axis = 1)
        
        inputs = categorical_features.join(numerical_features)
#         inputs.dropna(inplace = True)
#         inputs.reset_index(drop = True, inplace = True)
        return inputs

In [58]:
pipeline = Pipeline( steps = [ 
                            ( 'custom_standardizer', CustomNormalizer(X_train.columns.values) ),
                             ( 'lr', LogisticRegression(multi_class = 'multinomial', C = 1, penalty = 'l1', solver = 'saga' ))
                             ])

In [60]:
pipeline.fit(inputs, target)

Pipeline(memory=None,
         steps=[('custom_standardizer',
                 CustomNormalizer(copy=None,
                                  features=array(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'], dtype=object),
                                  with_mean=None, with_std=None)),
                ('lr',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='multinomial', n_jobs=None,
                                    penalty='l1', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         

In [56]:
pipeline.score(inputs, target)

0.984