# Classification Problem

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

### import Dataset

In [3]:
import pandas as pd
df = pd.read_csv('Datasets/personality_datasert.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2900 non-null   float64
 1   Stage_fear                 2900 non-null   object 
 2   Social_event_attendance    2900 non-null   float64
 3   Going_outside              2900 non-null   float64
 4   Drained_after_socializing  2900 non-null   object 
 5   Friends_circle_size        2900 non-null   float64
 6   Post_frequency             2900 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB


### split data 

In [4]:
X = df.drop(columns=['Personality'])
Y = df[['Personality']]

In [5]:
X.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,4.0,No,4.0,6.0,No,13.0,5.0
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0
3,0.0,No,6.0,7.0,No,14.0,8.0
4,3.0,No,9.0,4.0,No,8.0,5.0


In [6]:
Y.head()

Unnamed: 0,Personality
0,Extrovert
1,Introvert
2,Introvert
3,Extrovert
4,Extrovert


### Seperate the catogrical and continuous features

In [7]:
from PM6func import catconsep
cat, con = catconsep(X)
cat

['Stage_fear', 'Drained_after_socializing']

In [8]:
con

['Time_spent_Alone',
 'Social_event_attendance',
 'Going_outside',
 'Friends_circle_size',
 'Post_frequency']

### Preprocess the data

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline(steps=[('impute', SimpleImputer(strategy='mean')),
                              ('scaler',StandardScaler())])
cat_pipeline = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),
                               ('ordinal',OrdinalEncoder())])
Pre = ColumnTransformer([('num',num_pipeline,con),
                         ('con',cat_pipeline,cat)])
Pre

0,1,2
,transformers,"[('num', ...), ('con', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [10]:
X_pre =Pre.fit_transform(X)
X_pre

array([[-0.14701445,  0.01275908,  1.35061257, ...,  0.49610435,
         0.        ,  0.        ],
       [ 1.30622592, -1.37994399, -1.35061257, ..., -0.19519859,
         1.        ,  1.        ],
       [ 1.30622592, -1.03176822, -0.45020419, ..., -0.54085006,
         1.        ,  1.        ],
       ...,
       [-0.14701445, -1.03176822, -0.90040838, ..., -1.23215301,
         1.        ,  1.        ],
       [ 1.88752207, -1.03176822,  0.        , ..., -1.23215301,
         1.        ,  1.        ],
       [-0.43766252,  0.70911062,  1.35061257, ...,  1.87871024,
         0.        ,  0.        ]], shape=(2900, 7))

In [11]:
cols = Pre.get_feature_names_out()
cols

array(['num__Time_spent_Alone', 'num__Social_event_attendance',
       'num__Going_outside', 'num__Friends_circle_size',
       'num__Post_frequency', 'con__Stage_fear',
       'con__Drained_after_socializing'], dtype=object)

In [12]:
X_pre = pd.DataFrame(X_pre,columns=cols)
X_pre

Unnamed: 0,num__Time_spent_Alone,num__Social_event_attendance,num__Going_outside,num__Friends_circle_size,num__Post_frequency,con__Stage_fear,con__Drained_after_socializing
0,-0.147014,0.012759,1.350613,1.590680,0.496104,0.0,0.0
1,1.306226,-1.379944,-1.350613,-1.481437,-0.195199,1.0,1.0
2,1.306226,-1.031768,-0.450204,-0.299853,-0.540850,1.0,1.0
3,-1.309607,0.709111,1.800817,1.826996,1.533059,0.0,0.0
4,-0.437663,1.753638,0.450204,0.409096,0.496104,0.0,0.0
...,...,...,...,...,...,...,...
2895,-0.437663,1.057286,1.350613,-0.063537,0.841756,0.0,0.0
2896,-0.437663,1.405462,0.000000,1.826996,1.878710,0.0,0.0
2897,-0.147014,-1.031768,-0.900408,-0.536170,-1.232153,1.0,1.0
2898,1.887522,-1.031768,0.000000,-1.008803,-1.232153,1.0,1.0


### Train Test split
- 80% Train data
- 20% Test data

In [13]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest= train_test_split(X_pre,Y, test_size=0.2 ,random_state=21)
xtrain.shape

(2320, 7)

### Train Model
- Logistic regression
- Decision Tree

In [14]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(xtrain,ytrain)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [15]:
model.score(xtrain,ytrain)

0.9314655172413793

In [16]:
model.score(xtest,ytest)

0.9137931034482759

In [17]:
ypred_tr = model.predict(xtrain)
ypred_tr[0:5]

array(['Introvert', 'Introvert', 'Introvert', 'Introvert', 'Introvert'],
      dtype=object)

In [18]:
ytrain.head()

Unnamed: 0,Personality
1710,Introvert
802,Extrovert
1601,Introvert
1225,Introvert
546,Introvert


In [19]:
ypred_ts = model.predict(xtest)
ypred_ts[0:5]

array(['Introvert', 'Introvert', 'Introvert', 'Introvert', 'Extrovert'],
      dtype=object)

In [20]:
ytest.head()

Unnamed: 0,Personality
294,Introvert
2619,Introvert
766,Introvert
1210,Introvert
1741,Extrovert


In [21]:
from sklearn.metrics import classification_report
print(classification_report(ypred_tr,ytrain))

              precision    recall  f1-score   support

   Extrovert       0.93      0.94      0.93      1173
   Introvert       0.94      0.92      0.93      1147

    accuracy                           0.93      2320
   macro avg       0.93      0.93      0.93      2320
weighted avg       0.93      0.93      0.93      2320



In [22]:
from sklearn.metrics import classification_report
print(classification_report(ypred_ts,ytest))

              precision    recall  f1-score   support

   Extrovert       0.92      0.91      0.92       305
   Introvert       0.91      0.91      0.91       275

    accuracy                           0.91       580
   macro avg       0.91      0.91      0.91       580
weighted avg       0.91      0.91      0.91       580



In [23]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=1, min_samples_split=5, min_samples_leaf=3, criterion='entropy')
dtc.fit(xtrain,ytrain)


0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,1
,min_samples_split,5
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [24]:
dtc.score(xtrain,ytrain)

0.9275862068965517

In [25]:
dtc.score(xtest,ytest)

0.9086206896551724

In [26]:
dtcpred_tr = dtc.predict(xtrain)
dtcpred_tr[0:5]

array(['Introvert', 'Introvert', 'Introvert', 'Introvert', 'Introvert'],
      dtype=object)

In [27]:
ytrain.head()

Unnamed: 0,Personality
1710,Introvert
802,Extrovert
1601,Introvert
1225,Introvert
546,Introvert


In [28]:
dtcpred_ts = dtc.predict(xtest)
dtcpred_ts[0:5]

array(['Introvert', 'Introvert', 'Introvert', 'Introvert', 'Extrovert'],
      dtype=object)

In [29]:
ytest.head()

Unnamed: 0,Personality
294,Introvert
2619,Introvert
766,Introvert
1210,Introvert
1741,Extrovert
