# Logistics regression assignment

## Dataset
The dataset I chose is the affairs dataset that comes with Statsmodels. It was derived from a survey of women in 1974 by Redbook magazine, in which married women were asked about their participation in extramarital affairs. More information about
the study is available in a 1978 paper from the Journal of Political Economy.

## Description of Variables
1. The dataset contains 6366 observations of 9 variables: rate_marriage: woman's rating of her marriage (1 = very poor, 5 =very good)
2. age: woman's age
3. yrs_married: number of years married
4. children: number of children
5. religious: woman's rating of how religious she is (1 = not religious, 4 =strongly religious)
6. educ: level of education (9 = grade school, 12 = high school, 14 = some college, 16 = college graduate, 17 = some graduate school, 20 = advanced degree)
7. occupation: woman's occupation (1 = student, 2 = farming/semi- skilled/unskilled, 3 = "white collar", 4 =teacher/nurse/writer/technician/skilled, 5 = managerial/business, 6 = professional with advanced degree)
8. occupation_husb: husband's occupation (same coding as above)
9. affairs: time spent in extra-marital affairs

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.model_selection import cross_val_score 

In [2]:
dta = sm.datasets.fair.load_pandas().data

In [3]:
#add "affair" column: 1 represents having affairs, 0 represents not 
dta['affair'] = (dta.affairs >0).astype(int)

In [4]:
dta.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666,1


In [6]:
dta.shape

(6366, 10)

In [10]:
dta.describe()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair
count,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0
mean,4.109645,29.082862,9.009425,1.396874,2.42617,14.209865,3.424128,3.850141,0.705374,0.322495
std,0.96143,6.847882,7.28012,1.433471,0.878369,2.178003,0.942399,1.346435,2.203374,0.467468
min,1.0,17.5,0.5,0.0,1.0,9.0,1.0,1.0,0.0,0.0
25%,4.0,22.0,2.5,0.0,2.0,12.0,3.0,3.0,0.0,0.0
50%,4.0,27.0,6.0,1.0,2.0,14.0,3.0,4.0,0.0,0.0
75%,5.0,32.0,16.5,2.0,3.0,16.0,4.0,5.0,0.484848,1.0
max,5.0,42.0,23.0,5.5,4.0,20.0,6.0,6.0,57.599991,1.0


In [11]:
dta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6366 entries, 0 to 6365
Data columns (total 10 columns):
rate_marriage      6366 non-null float64
age                6366 non-null float64
yrs_married        6366 non-null float64
children           6366 non-null float64
religious          6366 non-null float64
educ               6366 non-null float64
occupation         6366 non-null float64
occupation_husb    6366 non-null float64
affairs            6366 non-null float64
affair             6366 non-null int32
dtypes: float64(9), int32(1)
memory usage: 472.6 KB


In [12]:
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
                  religious + educ + C(occupation) + C(occupation_husb)',
                  dta, return_type="dataframe")
X = X.rename(columns = {'C(occupation)[T.2.0]':'occ_2',
'C(occupation)[T.3.0]':'occ_3',
'C(occupation)[T.4.0]':'occ_4',
'C(occupation)[T.5.0]':'occ_5',
'C(occupation)[T.6.0]':'occ_6',
'C(occupation_husb)[T.2.0]':'occ_husb_2',
'C(occupation_husb)[T.3.0]':'occ_husb_3',
'C(occupation_husb)[T.4.0]':'occ_husb_4',
'C(occupation_husb)[T.5.0]':'occ_husb_5',
'C(occupation_husb)[T.6.0]':'occ_husb_6'})

y = np.ravel(y)

In [13]:
X

Unnamed: 0,Intercept,occ_2,occ_3,occ_4,occ_5,occ_6,occ_husb_2,occ_husb_3,occ_husb_4,occ_husb_5,occ_husb_6,rate_marriage,age,yrs_married,children,religious,educ
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,32.0,9.0,3.0,3.0,17.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,27.0,13.0,3.0,1.0,14.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,22.0,2.5,0.0,1.0,16.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,37.0,16.5,4.0,3.0,16.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,27.0,9.0,1.0,1.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,32.0,13.0,2.0,3.0,17.0
6362,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,32.0,13.0,1.0,1.0,16.0
6363,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,22.0,2.5,0.0,2.0,14.0
6364,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,32.0,6.0,1.0,3.0,14.0


In [14]:
y

array([1., 1., 1., ..., 0., 0., 0.])

In [15]:
X.describe()

Unnamed: 0,Intercept,occ_2,occ_3,occ_4,occ_5,occ_6,occ_husb_2,occ_husb_3,occ_husb_4,occ_husb_5,occ_husb_6,rate_marriage,age,yrs_married,children,religious,educ
count,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0,6366.0
mean,1.0,0.134936,0.437166,0.288093,0.116243,0.017122,0.205467,0.076971,0.318882,0.279453,0.083255,4.109645,29.082862,9.009425,1.396874,2.42617,14.209865
std,0.0,0.341682,0.496075,0.45291,0.320541,0.129737,0.404074,0.266567,0.46608,0.448766,0.276289,0.96143,6.847882,7.28012,1.433471,0.878369,2.178003
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,17.5,0.5,0.0,1.0,9.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,22.0,2.5,0.0,2.0,12.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,27.0,6.0,1.0,2.0,14.0
75%,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,5.0,32.0,16.5,2.0,3.0,16.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,42.0,23.0,5.5,4.0,20.0


In [17]:
X = X.drop(columns = ['Intercept'])

In [18]:
X['religious'].unique()

array([3., 1., 2., 4.])

In [19]:
X['educ'].unique()

array([17., 14., 16., 12., 20.,  9.])

In [20]:
X['rate_marriage'].unique()

array([3., 4., 5., 2., 1.])

In [23]:
X.isnull().sum()

occ_2            0
occ_3            0
occ_4            0
occ_5            0
occ_6            0
occ_husb_2       0
occ_husb_3       0
occ_husb_4       0
occ_husb_5       0
occ_husb_6       0
rate_marriage    0
age              0
yrs_married      0
children         0
religious        0
educ             0
dtype: int64

no empty values as seen above so we are good to go

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)

In [26]:
x_train,x_test,y_train,y_test = train_test_split(X_scaled,y, test_size= 0.25, random_state = 355)

In [27]:
log_reg = LogisticRegression()

log_reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
y_pred = log_reg.predict(x_test)

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [30]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.7204773869346733

In [31]:
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

array([[970, 114],
       [331, 177]], dtype=int64)

### 1- affair (positive) , 0- no affair (negative)
1.  970 women have affair and model predicted correctly
2. 114 women have affair and model predicted incorrectly ( Type 1 error)
3.  331 women dont have an affair and model predicted incorrectly (Type 2 Error)
4. 177 women dont have an affair and model predicted correctly

In [36]:
true_positive = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_negative = conf_mat[1][1]

In [37]:
# Precison
Precision = true_positive/(true_positive+false_positive)
Precision

0.8948339483394834

In [38]:
# Recall
Recall = true_positive/(true_positive+false_negative)
Recall

0.7455803228285934

In [39]:
# F1 Score
F1_Score = 2*(Recall * Precision) / (Recall + Precision)
F1_Score


0.8134171907756814

In [40]:
# Area Under Curve
auc = roc_auc_score(y_test, y_pred)
auc

0.6216295725949386