In [171]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
from sklearn import svm
from sklearn import model_selection
from statsmodels.tools import eval_measures
from statsmodels.tools.eval_measures import mse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,classification_report
%matplotlib inline

In [114]:
# Load the 'OFP' dataset through pydataset's `data` helper and hold it as a DataFrame.
df = pd.DataFrame(data('OFP'))

In [115]:
# Preview the first five rows to see the column names and value formats.
df.head()

Unnamed: 0,ofp,ofnp,opp,opnp,emr,hosp,numchron,adldiff,age,black,sex,maried,school,faminc,employed,privins,medicaid,region,hlth
1,5,0,0,0,0,1,2,0,6.9,yes,male,yes,6,2.881,yes,yes,no,other,other
2,1,0,2,0,2,0,2,0,7.4,no,female,yes,10,2.7478,no,yes,no,other,other
3,13,0,0,0,3,3,4,1,6.6,yes,female,no,10,0.6532,no,no,yes,other,poor
4,16,0,5,0,1,1,2,1,7.6,no,male,yes,3,0.6588,no,yes,no,other,poor
5,3,0,0,0,0,0,2,1,7.9,no,female,yes,6,0.6588,no,yes,no,other,other


In [116]:
# (rows, columns) of the raw dataset.
df.shape

(4406, 19)

In [117]:
# df.columns

In [118]:
# df.isnull().sum()

In [119]:
# df.describe()

In [120]:
# Number of distinct values per column; a count of 2 marks a binary variable.
df.nunique()

ofp           60
ofnp          51
opp           37
opnp          35
emr           11
hosp           9
numchron       9
adldiff        2
age           36
black          2
sex            2
maried         2
school        19
faminc      3015
employed       2
privins        2
medicaid       2
region         4
hlth           3
dtype: int64

In [121]:
# dropna() returns a new DataFrame rather than modifying `df`, so the result
# has to be assigned back for the rows to actually be removed.
df = df.dropna()
df.shape   # still (4406, 19): there were no missing values

(4406, 19)

In [122]:
# Integer-encode four two-valued text columns into new numeric columns.
# The source columns are left in place here and dropped in a later cell.
label_encoder = LabelEncoder()
encoded_from = {
    'Black': 'black',
    'Mariedd': 'maried',
    'sexx': 'sex',
    'emplstatus': 'employed',
}
for new_col, source_col in encoded_from.items():
    # The same encoder is re-fitted for each column, so after the loop it
    # only remembers the classes of the last column.
    df[new_col] = label_encoder.fit_transform(df[source_col])

In [123]:
# Confirm the four encoded columns were appended alongside the original ones.
df.head()

Unnamed: 0,ofp,ofnp,opp,opnp,emr,hosp,numchron,adldiff,age,black,...,faminc,employed,privins,medicaid,region,hlth,Black,Mariedd,sexx,emplstatus
1,5,0,0,0,0,1,2,0,6.9,yes,...,2.881,yes,yes,no,other,other,1,1,1,1
2,1,0,2,0,2,0,2,0,7.4,no,...,2.7478,no,yes,no,other,other,0,1,0,0
3,13,0,0,0,3,3,4,1,6.6,yes,...,0.6532,no,no,yes,other,poor,1,0,0,0
4,16,0,5,0,1,1,2,1,7.6,no,...,0.6588,no,yes,no,other,poor,0,1,1,0
5,3,0,0,0,0,0,2,1,7.9,no,...,0.6588,no,yes,no,other,other,0,1,0,0


In [124]:
# List every column, original and encoded, before removing the text columns.
print(df.columns)


Index(['ofp', 'ofnp', 'opp', 'opnp', 'emr', 'hosp', 'numchron', 'adldiff',
       'age', 'black', 'sex', 'maried', 'school', 'faminc', 'employed',
       'privins', 'medicaid', 'region', 'hlth', 'Black', 'Mariedd', 'sexx',
       'emplstatus'],
      dtype='object')


In [125]:
# Remove all string-valued columns. 'black', 'sex', 'maried' and 'employed'
# already have encoded counterparts; 'privins', 'medicaid', 'region' and
# 'hlth' are discarded without being encoded.
text_columns = ['black', 'sex', 'maried', 'employed',
                'privins', 'medicaid', 'region', 'hlth']
df = df.drop(columns=text_columns)

In [126]:
# Verify that only numeric columns remain after the drop.
print(df.columns)
df.head(3)

Index(['ofp', 'ofnp', 'opp', 'opnp', 'emr', 'hosp', 'numchron', 'adldiff',
       'age', 'school', 'faminc', 'Black', 'Mariedd', 'sexx', 'emplstatus'],
      dtype='object')


Unnamed: 0,ofp,ofnp,opp,opnp,emr,hosp,numchron,adldiff,age,school,faminc,Black,Mariedd,sexx,emplstatus
1,5,0,0,0,0,1,2,0,6.9,6,2.881,1,1,1,1
2,1,0,2,0,2,0,2,0,7.4,10,2.7478,0,1,0,0
3,13,0,0,0,3,3,4,1,6.6,10,0.6532,1,0,0,0


### Scaling of Variables

Two equivalent methods can be used to min-max scale every column into the range [0, 1]:

    1. df = (df-df.min())/(df.max()-df.min())
    
    2. # Create a MinMaxScaler object
        scaler = MinMaxScaler()

        #Scale the variable(s)
        scaled_data = scaler.fit_transform(df)
        
        #Update the original DataFrame with scaled values
        df.loc[:, :] = scaled_data

In [129]:
# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Scale every column (the features and the 'Mariedd' target alike) into [0, 1].
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima influence the training features. Consider
# fitting on the training split only.
scaled_data = scaler.fit_transform(df)

# Rebuild the DataFrame from the scaled array instead of writing the floats
# back with `df.loc[:, :] = ...`: several columns are integer-typed, and
# assigning floats into them in place relies on silent dtype upcasting, which
# newer pandas versions deprecate. The resulting values are identical.
df = pd.DataFrame(scaled_data, columns=df.columns, index=df.index)
df.head(3)

Unnamed: 0,ofp,ofnp,opp,opnp,emr,hosp,numchron,adldiff,age,school,faminc,Black,Mariedd,sexx,emplstatus
1,0.05618,0.0,0.0,0.0,0.0,0.125,0.25,0.0,0.069767,0.333333,0.069717,1.0,1.0,1.0,1.0
2,0.011236,0.0,0.014184,0.0,0.166667,0.0,0.25,0.0,0.186047,0.555556,0.067331,0.0,1.0,0.0,0.0
3,0.146067,0.0,0.0,0.0,0.25,0.375,0.5,1.0,0.0,0.555556,0.029826,1.0,0.0,0.0,0.0


In [130]:
# Column names after scaling (the scaler changes values, not labels).
df.columns

Index(['ofp', 'ofnp', 'opp', 'opnp', 'emr', 'hosp', 'numchron', 'adldiff',
       'age', 'school', 'faminc', 'Black', 'Mariedd', 'sexx', 'emplstatus'],
      dtype='object')

# Model Development


### Creating the Independent Variables (X) and the Target (y)

In [139]:
# The target is the encoded marital status; every other remaining column is
# used as a predictor.
target_column = 'Mariedd'
feature_columns = [
    'ofp', 'ofnp', 'opp', 'opnp', 'emr', 'hosp', 'numchron', 'adldiff',
    'age', 'school', 'faminc', 'Black', 'sexx', 'emplstatus',
]

X = df[feature_columns]
y = df[target_column]

In [140]:
# Peek at the target values; they are floats because the whole frame was min-max scaled.
print(y.head(2))


1    1.0
2    1.0
Name: Mariedd, dtype: float64


### Test and Train Sets 

In [141]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split the
# same on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [142]:
# Report (rows, columns) for the training split, then the test split.
for split in (X_train, X_test):
    print(split.shape)

(3084, 14)
(1322, 14)


### Models

In [173]:
# Linear support-vector classifier. The regularization hyperparameter C is
# left at its default. A fixed random_state makes the solver's internal data
# shuffling, and therefore the fitted model and its scores, reproducible
# across runs.
model = svm.LinearSVC(random_state=1)

##### Fitting the Model

In [174]:
# Train the classifier on the training split only.
model.fit(X_train,y_train)

LinearSVC()

##### Checking the Accuracy of the Model on the Training Data

In [175]:
# Accuracy on the same rows the model was trained on (an optimistic estimate).
model.score(X_train,y_train)

0.7487029831387808

#  MODEL TESTING

In [168]:
# Predict labels for the held-out rows the model never saw during fitting.
ypred = model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=ypred)

0.7216338880484114

#### Confusion Matrix (actual vs. predicted)

In [170]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
pd.crosstab(index=y_test, columns=ypred)

col_0,0.0,1.0
Mariedd,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,441,171
1.0,197,513


In [172]:
# Per-class precision, recall and F1 on the held-out data.
report = classification_report(y_test, ypred)
print(report)

              precision    recall  f1-score   support

         0.0       0.69      0.72      0.71       612
         1.0       0.75      0.72      0.74       710

    accuracy                           0.72      1322
   macro avg       0.72      0.72      0.72      1322
weighted avg       0.72      0.72      0.72      1322

