In [2]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

class Classification_Model:
    """Small workflow helper for fitting a classifier on a DataFrame.

    Typical call order: ``target_variable`` -> optional ``label_encode`` /
    ``remove_x_column`` / ``select_k_best`` -> ``make_train_test_split`` ->
    ``make_model`` -> the ``model_get_*`` reporters.

    Attributes set along the way:
        df: the working DataFrame.
        y_var, x_var: target column name and list of predictor column names.
        x_train, x_test, y_train, y_test: the most recent train/test split.
        model_type, model: key and fitted estimator from ``make_model``.
        y_pred: test-set predictions from ``model_get_confusion_matrix``.
    """

    # Keys accepted by make_model.
    _MODEL_TYPES = ('rf', 'lr', 'gbc')
    # Subset of _MODEL_TYPES whose estimators expose ``feature_importances_``.
    _IMPORTANCE_MODEL_TYPES = ('rf', 'gbc')

    def __init__(self, df):
        """Store ``df`` as the working frame (no copy is taken here)."""
        self.df = df

    def target_variable(self, y_var_string):
        """Choose the target column; every other column becomes a predictor.

        Prints a message and leaves the object unchanged when
        ``y_var_string`` is not a column of the working frame.
        """
        columns = list(self.df)
        if y_var_string not in columns:
            print("Y variable not in df")
            return

        columns.remove(y_var_string)
        self.y_var = y_var_string
        self.x_var = columns

    def select_k_best(self, K):
        """Narrow ``x_var`` to the ``K`` predictors chosen by SelectKBest/f_classif.

        The surviving names keep their original relative order.
        """
        selector = SelectKBest(f_classif, k=K).fit(self.df[self.x_var],
                                                   self.df[self.y_var])
        kept_positions = selector.get_support(indices=True)
        self.x_var = [self.x_var[position] for position in kept_positions]

    def label_encode(self):
        """Replace every column (target included) with integer label codes.

        Values are cast to ``str`` before encoding, so numeric columns are
        treated as categories as well. The encoding is applied to a copy, so
        the DataFrame originally handed to the constructor is not modified.
        """
        encoded = self.df.copy()
        for column in encoded.columns:
            # A fresh encoder per column keeps the columns' label sets independent.
            encoded[column] = LabelEncoder().fit_transform(encoded[column].astype(str))
        self.df = encoded

    def remove_x_column(self, column_name_string):
        """Drop one name from the predictor list.

        Raises:
            ValueError: if ``column_name_string`` is not currently a predictor.
        """
        remaining = list(self.x_var)
        remaining.remove(column_name_string)
        self.x_var = remaining

    def show_x_columns(self):
        """Print the current predictor column names."""
        print(self.x_var)

    def show_y_columns(self):
        """Print the current target column name."""
        print(self.y_var)

    def make_train_test_split(self, test_size, random_state=None):
        """Split predictors and target into train and test parts.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: optional seed for a reproducible split; the default
                ``None`` keeps the original unseeded behaviour.
        """
        (self.x_train,
         self.x_test,
         self.y_train,
         self.y_test) = train_test_split(self.df[self.x_var],
                                         self.df[self.y_var],
                                         test_size=test_size,
                                         random_state=random_state)

    def make_model(self, model_type, random_state=None):
        """Fit the estimator named by ``model_type`` on the training split.

        Args:
            model_type: ``'rf'`` (random forest, 200 trees), ``'lr'``
                (L2-penalised logistic regression) or ``'gbc'`` (gradient
                boosting, 500 stages).
            random_state: optional seed forwarded to the estimator.

        An unknown ``model_type`` prints a message and leaves any previously
        fitted model and its ``model_type`` untouched.
        """
        if model_type not in self._MODEL_TYPES:
            print('Sorry, don\'t know that one')
            return

        if model_type == 'rf':
            model = RandomForestClassifier(n_estimators=200,
                                           random_state=random_state)
        elif model_type == 'lr':
            model = LogisticRegression(penalty='l2', random_state=random_state)
        else:
            model = GradientBoostingClassifier(n_estimators=500,
                                               random_state=random_state)

        model.fit(self.x_train, self.y_train)

        # Record the key only after a successful fit so that model_type
        # always describes the estimator stored in self.model.
        self.model_type = model_type
        self.model = model

    def model_get_feature_importances(self):
        """Return predictors ranked by importance, highest first.

        Returns:
            A DataFrame with ``feature`` and ``importance`` columns for the
            ``'rf'`` and ``'gbc'`` model types; otherwise prints a message and
            returns ``None``.
        """
        if self.model_type not in self._IMPORTANCE_MODEL_TYPES:
            print('That feature is not availble for this model')
            return None

        feature_importance = pd.DataFrame({
            'feature': self.x_var,
            'importance': list(self.model.feature_importances_),
        })
        return feature_importance.sort_values(by=['importance'], ascending=False)

    def model_get_confusion_matrix(self):
        """Predict on the test split and cross-tabulate truth against prediction.

        Stores the predictions on ``self.y_pred``. Rows of the result are the
        true labels and columns are the predicted labels.
        """
        self.y_pred = self.model.predict(self.x_test)
        return pd.crosstab(self.y_test, self.y_pred)

    def model_get_accuracy(self):
        """Return the fitted model's ``score`` on the test split."""
        return self.model.score(self.x_test, self.y_test)

In [3]:
# Load the reviews CSV; the path is relative to the notebook's working directory.
# No index_col is passed, so the file's first column arrives as 'Unnamed: 0'
# (presumably a saved row index — confirm); a later cell drops it.
df = pd.read_csv('winemag-data_first150k.csv')

In [4]:
# Wrap the loaded frame in the helper class defined above.
z = Classification_Model(df)

In [6]:
# Predict 'country'; every other column becomes a candidate predictor.
z.target_variable('country')

In [7]:
# Confirm the target column was registered.
z.y_var

'country'

In [8]:
# Remove 'Unnamed: 0' from the predictors (presumably a leftover row index — confirm).
z.remove_x_column('Unnamed: 0')

In [9]:
# Inspect the remaining candidate predictors.
z.x_var

['description',
 'designation',
 'points',
 'price',
 'province',
 'region_1',
 'region_2',
 'variety',
 'winery']

In [10]:
# Integer-encode every column, target included. Values are cast to str first,
# so numeric columns such as points and price are encoded as categories too.
z.label_encode()

In [11]:
# Encoding changes values only; the predictor list is unchanged.
z.x_var

['description',
 'designation',
 'points',
 'price',
 'province',
 'region_1',
 'region_2',
 'variety',
 'winery']

In [12]:
# Keep the 3 predictors that SelectKBest ranks highest by f_classif score against the target.
z.select_k_best(3)

In [13]:
# Show which predictors survived selection.
z.x_var

['province', 'region_1', 'region_2']

In [14]:
# Hold out 20% of the rows for testing. No seed is supplied here, so the
# split (and every number below) can differ from run to run.
z.make_train_test_split(0.2)

In [16]:
# Fit the random-forest option ('rf' builds a 200-tree RandomForestClassifier).
z.make_model('rf')

In [17]:
# Display the fitted estimator and its hyperparameters.
z.model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
# Rank the selected predictors by the forest's feature importances (highest first).
z.model_get_feature_importances()

Unnamed: 0,feature,importance
0,province,0.526595
2,region_2,0.25847
1,region_1,0.214935


In [19]:
# Accuracy on the held-out rows.
# NOTE(review): the selected predictors are province / region_1 / region_2,
# which presumably determine the country almost directly, so a near-perfect
# score is expected and says little about the model itself — confirm.
z.model_get_accuracy()

0.9996687205989532

In [20]:
# Cross-tabulate true labels (rows) against predicted labels (columns) on the
# test split; the labels are the integer codes produced by label_encode().
z.model_get_confusion_matrix()

col_0,0,1,2,3,4,5,6,7,8,10,...,34,35,36,37,38,39,40,43,44,47
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1110,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,965,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,618,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,16,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,41,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1129,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,17,...,0,0,0,0,0,0,0,0,0,0
