# **DATA PREPROCESSING**

In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [65]:
data=pd.read_csv('fertility_Diagnosis.txt')
data

Unnamed: 0,Season,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,N
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,O
2,-0.33,0.50,1,0,0,0,1.0,-1,0.50,N
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,N
4,-0.33,0.67,1,1,0,0,0.8,-1,0.50,O
...,...,...,...,...,...,...,...,...,...,...
95,-1.00,0.67,1,0,0,0,1.0,-1,0.50,N
96,-1.00,0.61,1,0,0,0,0.8,0,0.50,N
97,-1.00,0.67,1,1,1,0,1.0,-1,0.31,N
98,-1.00,0.64,1,0,1,0,1.0,0,0.19,N


In [66]:
data.columns

Index(['Season', 'Age(18-36)', 'ChildishDiseases', 'Accidents',
       'SurgicalIntervention', 'HighFevers', 'FreqAlcohol', 'Smoking',
       'HoursSit', 'Output'],
      dtype='object')

In [67]:
data.drop(columns='Season',axis=1,inplace=True)
data.head()

Unnamed: 0,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output
0,0.69,0,1,1,0,0.8,0,0.88,N
1,0.94,1,0,1,0,0.8,1,0.31,O
2,0.5,1,0,0,0,1.0,-1,0.5,N
3,0.75,0,1,1,0,1.0,-1,0.38,N
4,0.67,1,1,0,0,0.8,-1,0.5,O


In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age(18-36)            100 non-null    float64
 1   ChildishDiseases      100 non-null    int64  
 2   Accidents             100 non-null    int64  
 3   SurgicalIntervention  100 non-null    int64  
 4   HighFevers            100 non-null    int64  
 5   FreqAlcohol           100 non-null    float64
 6   Smoking               100 non-null    int64  
 7   HoursSit              100 non-null    float64
 8   Output                100 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 7.2+ KB


In [69]:
from sklearn.preprocessing import LabelEncoder
data['Output_enc']=LabelEncoder().fit_transform(data[['Output']])
data

  return f(*args, **kwargs)


Unnamed: 0,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output,Output_enc
0,0.69,0,1,1,0,0.8,0,0.88,N,0
1,0.94,1,0,1,0,0.8,1,0.31,O,1
2,0.50,1,0,0,0,1.0,-1,0.50,N,0
3,0.75,0,1,1,0,1.0,-1,0.38,N,0
4,0.67,1,1,0,0,0.8,-1,0.50,O,1
...,...,...,...,...,...,...,...,...,...,...
95,0.67,1,0,0,0,1.0,-1,0.50,N,0
96,0.61,1,0,0,0,0.8,0,0.50,N,0
97,0.67,1,1,1,0,1.0,-1,0.31,N,0
98,0.64,1,0,1,0,1.0,0,0.19,N,0


In [70]:
data.dtypes

Age(18-36)              float64
ChildishDiseases          int64
Accidents                 int64
SurgicalIntervention      int64
HighFevers                int64
FreqAlcohol             float64
Smoking                   int64
HoursSit                float64
Output                   object
Output_enc                int32
dtype: object

## Prepare train and test dataset

In [71]:
#creating our X feature variables and y outcome variable
X = data.drop('Output',axis=1)
y = data['Output_enc']

In [72]:
#splitting our data into training and testing data set
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

# **Random Forest Classification**

### train_array(X_train) = train dataset WITHOUT occupancy

### test_array(X_test) = test dataset WITHOUT occupancy
### occlabel_array(y_train) = 'Occupancy' label from train dataset

In [73]:
#Random Forest Classification
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier()
clf.fit(X_train,y_train)


RandomForestClassifier()

In [74]:
#To predict occupancy using the test's set features
clf_pred=clf.predict(X_test)
print(clf_pred)

[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0]


In [75]:
# Transfer the array of predicted popularity of test data into dataframe
clf_pred_df = pd.DataFrame(data=clf_pred, columns=['Output_enc'])
clf_pred_df.head(10)

Unnamed: 0,Output_enc
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,1
8,0
9,0


In [76]:
RFprob_each = clf.predict_proba(X_test)
RFprob_each

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.21, 0.79],
       [1.  , 0.  ],
       [0.22, 0.78],
       [0.99, 0.01],
       [0.97, 0.03],
       [0.95, 0.05],
       [0.98, 0.02],
       [0.93, 0.07],
       [0.87, 0.13],
       [0.91, 0.09],
       [0.93, 0.07],
       [0.99, 0.01],
       [0.06, 0.94],
       [1.  , 0.  ],
       [0.8 , 0.2 ]])

In [77]:
pos_prob = []
for inner in RFprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.0, 0.0, 0.0, 0.0, 0.0, 0.79, 0.0, 0.78, 0.01, 0.03]

In [78]:
data.head()

Unnamed: 0,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output,Output_enc
0,0.69,0,1,1,0,0.8,0,0.88,N,0
1,0.94,1,0,1,0,0.8,1,0.31,O,1
2,0.5,1,0,0,0,1.0,-1,0.5,N,0
3,0.75,0,1,1,0,1.0,-1,0.38,N,0
4,0.67,1,1,0,0,0.8,-1,0.5,O,1


In [79]:
clf_pred_df

Unnamed: 0,Output_enc
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,1
8,0
9,0


In [80]:
# Insert result into test_wtpopl, check the prediction correctness
data.insert(len(data.columns), 'RForest', clf_pred_df['Output_enc'])

In [81]:
data

Unnamed: 0,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output,Output_enc,RForest
0,0.69,0,1,1,0,0.8,0,0.88,N,0,0.0
1,0.94,1,0,1,0,0.8,1,0.31,O,1,0.0
2,0.50,1,0,0,0,1.0,-1,0.50,N,0,0.0
3,0.75,0,1,1,0,1.0,-1,0.38,N,0,0.0
4,0.67,1,1,0,0,0.8,-1,0.50,O,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...
95,0.67,1,0,0,0,1.0,-1,0.50,N,0,
96,0.61,1,0,0,0,0.8,0,0.50,N,0,
97,0.67,1,1,1,0,1.0,-1,0.31,N,0,
98,0.64,1,0,1,0,1.0,0,0.19,N,0,


# **Naive Bayes**

In [82]:
from sklearn.naive_bayes import GaussianNB

gaus = GaussianNB()
gaus.fit(X_train, y_train)

GaussianNB()

In [83]:
# To predict popularity using test set's features
NB_pred= gaus.predict(X_test)
print(NB_pred)

[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0]


In [84]:
# Transfer the array of predicted popularity of test data into dataframe
NB_pred_df = pd.DataFrame(data=NB_pred, columns=['Output_enc'])
NB_pred_df.head(10)

Unnamed: 0,Output_enc
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,1
8,0
9,0


In [85]:
NBprob_each = gaus.predict_proba(X_test)
NBprob_each

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [86]:
pos_prob = []
for inner in NBprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]

In [87]:
# Insert result into test_wtpopl, check the prediction correctness
data.insert(len(data.columns), 'NBGaus', NB_pred_df['Output_enc'])

In [88]:
data.head()

Unnamed: 0,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output,Output_enc,RForest,NBGaus
0,0.69,0,1,1,0,0.8,0,0.88,N,0,0.0,0.0
1,0.94,1,0,1,0,0.8,1,0.31,O,1,0.0,0.0
2,0.5,1,0,0,0,1.0,-1,0.5,N,0,0.0,0.0
3,0.75,0,1,1,0,1.0,-1,0.38,N,0,0.0,0.0
4,0.67,1,1,0,0,0.8,-1,0.5,O,1,0.0,0.0


# **KNN Classification**

In [89]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [90]:
# To predict popularity using test set's features
KNN_pred= knn.predict(X_test)
print(KNN_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]


In [91]:
# Transfer the array of predicted popularity of test data into dataframe
KNpredct_df = pd.DataFrame(data=KNN_pred, columns=['Output_enc'])
KNpredct_df.head(10)

Unnamed: 0,Output_enc
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [92]:
KNprob_each = knn.predict_proba(X_test)
KNprob_each

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.57142857, 0.42857143],
       [1.        , 0.        ],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.42857143, 0.57142857],
       [0.85714286, 0.14285714],
       [1.        , 0.        ]])

In [93]:
pos_prob = []
for inner in KNprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.42857142857142855,
 0.0,
 0.14285714285714285,
 0.0,
 0.0]

In [94]:
# Insert result into test_wtpopl, check the prediction correctness
data.insert(len(data.columns), 'KNeigh', KNpredct_df['Output_enc'])

In [95]:
data

Unnamed: 0,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output,Output_enc,RForest,NBGaus,KNeigh
0,0.69,0,1,1,0,0.8,0,0.88,N,0,0.0,0.0,0.0
1,0.94,1,0,1,0,0.8,1,0.31,O,1,0.0,0.0,0.0
2,0.50,1,0,0,0,1.0,-1,0.50,N,0,0.0,0.0,0.0
3,0.75,0,1,1,0,1.0,-1,0.38,N,0,0.0,0.0,0.0
4,0.67,1,1,0,0,0.8,-1,0.50,O,1,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.67,1,0,0,0,1.0,-1,0.50,N,0,,,
96,0.61,1,0,0,0,0.8,0,0.50,N,0,,,
97,0.67,1,1,1,0,1.0,-1,0.31,N,0,,,
98,0.64,1,0,1,0,1.0,0,0.19,N,0,,,


# **Decision Tree**

In [96]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [97]:
#To predict occupancy using the test's set features
model_pred=model.predict(X_test)
print(model_pred)

[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0]


In [98]:
# Transfer the array of predicted popularity of test data into dataframe
model_pred_df = pd.DataFrame(data=model_pred, columns=['Output_enc'])
model_pred_df.head(10)

Unnamed: 0,Output_enc
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,1
8,0
9,0


In [99]:
DTprob_each = model.predict_proba(X_test)
DTprob_each

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [100]:
pos_prob = []
for inner in DTprob_each:
    pos_prob.append(inner[1])
pos_prob[0:10]


[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]

In [101]:
# Insert result into test_wtpopl, check the prediction correctness
data.insert(len(data.columns), 'DecisionT', clf_pred_df['Output_enc'])

In [102]:
data

Unnamed: 0,Age(18-36),ChildishDiseases,Accidents,SurgicalIntervention,HighFevers,FreqAlcohol,Smoking,HoursSit,Output,Output_enc,RForest,NBGaus,KNeigh,DecisionT
0,0.69,0,1,1,0,0.8,0,0.88,N,0,0.0,0.0,0.0,0.0
1,0.94,1,0,1,0,0.8,1,0.31,O,1,0.0,0.0,0.0,0.0
2,0.50,1,0,0,0,1.0,-1,0.50,N,0,0.0,0.0,0.0,0.0
3,0.75,0,1,1,0,1.0,-1,0.38,N,0,0.0,0.0,0.0,0.0
4,0.67,1,1,0,0,0.8,-1,0.50,O,1,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.67,1,0,0,0,1.0,-1,0.50,N,0,,,,
96,0.61,1,0,0,0,0.8,0,0.50,N,0,,,,
97,0.67,1,1,1,0,1.0,-1,0.31,N,0,,,,
98,0.64,1,0,1,0,1.0,0,0.19,N,0,,,,


# **Evaluation of Accuracy Score for Classifiers**

## Random Forest Scores

In [103]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,clf_pred)
print(ac)

1.0


## Naive Bayes Scores

In [104]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,NB_pred)
#y_true = test_df_occ['Occupancy']
#y_true.head()
print(ac)

1.0


## KNN Scores

In [105]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,KNN_pred)
#y_true = test_df_occ['Occupancy']
#y_true.head()
print(ac)

0.9


## Decision Tree Scores

In [106]:
from sklearn.metrics import accuracy_score

ac=accuracy_score(y_test,model_pred)
#y_true = test_df_occ['Occupancy']
#y_true.head()
print(ac)

1.0
