##Import Data and Python Packages

In [334]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from scipy import stats

sns.set()
stats.chisqprob = lambda chiq, df:stats.chi2.sf(chisq, sf)

In [335]:
raw_data = pd.read_csv("DSChallenge.csv")

In [336]:
raw_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,ID1,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,ID2,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,ID3,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,ID4,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,ID5,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [337]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [338]:
raw_data=raw_data.rename({"Loan_ID": "loan_id", "ApplicantIncome":"applicant_income", 
                 "CoApplicantIncome":"co_applicant_income", "LoanAmount":"loan_amount", 
                 "Loan_Amount_Term":"loan_amount_term", "Credit_History":"credit_history",
                 "Property_Area":"property_area", "Loan_Status":"loan_status"},  axis = 1)

In [339]:
columns=[]
for col in raw_data.columns:
  col = col.lower()
  columns.append(col) 

In [340]:
raw_data.columns = columns
raw_data.columns

Index(['loan_id', 'gender', 'married', 'dependents', 'education',
       'self_employed', 'applicant_income', 'coapplicantincome', 'loan_amount',
       'loan_amount_term', 'credit_history', 'property_area', 'loan_status'],
      dtype='object')

##Preprocessing

In [341]:
raw_data.describe(include = 'all')

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicant_income,coapplicantincome,loan_amount,loan_amount_term,credit_history,property_area,loan_status
count,614,601,611,599.0,614,582,614.0,614.0,592.0,600.0,564.0,614,614
unique,614,2,2,4.0,2,2,,,,,,3,2
top,ID388,Male,Yes,0.0,Graduate,No,,,,,,Semiurban,Y
freq,1,489,398,345.0,480,500,,,,,,233,422
mean,,,,,,,5403.459283,1621.245798,146.412162,342.0,0.842199,,
std,,,,,,,6109.041673,2926.248369,85.587325,65.12041,0.364878,,
min,,,,,,,150.0,0.0,9.0,12.0,0.0,,
25%,,,,,,,2877.5,0.0,100.0,360.0,1.0,,
50%,,,,,,,3812.5,1188.5,128.0,360.0,1.0,,
75%,,,,,,,5795.0,2297.25,168.0,360.0,1.0,,


In [342]:
data = raw_data.drop(['gender', 'loan_id'], axis=1)

1. Each variable has a different number of observations, which implies that there are some missing values 
2. *gender* has 489 male entries, which is almost 80% of the data. This feature probably won't be useful


In [343]:
data.describe(include='all')

Unnamed: 0,married,dependents,education,self_employed,applicant_income,coapplicantincome,loan_amount,loan_amount_term,credit_history,property_area,loan_status
count,611,599.0,614,582,614.0,614.0,592.0,600.0,564.0,614,614
unique,2,4.0,2,2,,,,,,3,2
top,Yes,0.0,Graduate,No,,,,,,Semiurban,Y
freq,398,345.0,480,500,,,,,,233,422
mean,,,,,5403.459283,1621.245798,146.412162,342.0,0.842199,,
std,,,,,6109.041673,2926.248369,85.587325,65.12041,0.364878,,
min,,,,,150.0,0.0,9.0,12.0,0.0,,
25%,,,,,2877.5,0.0,100.0,360.0,1.0,,
50%,,,,,3812.5,1188.5,128.0,360.0,1.0,,
75%,,,,,5795.0,2297.25,168.0,360.0,1.0,,


###Dealing With Missing Values

In [344]:
data.isnull().sum()

married               3
dependents           15
education             0
self_employed        32
applicant_income      0
coapplicantincome     0
loan_amount          22
loan_amount_term     14
credit_history       50
property_area         0
loan_status           0
dtype: int64

In [345]:
married=data[data["married"].isnull()][["dependents", "self_employed"]]
married

Unnamed: 0,dependents,self_employed
104,,No
228,,No
435,,No


In [346]:
data["married"] = data["married"].replace(np.nan, "No")
data["married"].isnull().sum()

0

In [347]:
data["self_employed"]=data["self_employed"].replace(np.nan, "No")
data["self_employed"].isnull().sum()

0

In [348]:
data["loan_amount"].fillna(data["loan_amount"].median(skipna=True), inplace=True)
data["loan_amount"].isnull().sum()

0

In [349]:
data["loan_amount_term"].fillna(data["loan_amount_term"].median(skipna=True), inplace=True)
data["loan_amount_term"].isnull().sum()

0

In [350]:
data["credit_history"].fillna(data["credit_history"].median(skipna=True), inplace=True)
data["credit_history"].isnull().sum()

0

In [351]:
data_no_mv = data.dropna(axis=0)

In [352]:
data_no_mv.isnull().sum()

married              0
dependents           0
education            0
self_employed        0
applicant_income     0
coapplicantincome    0
loan_amount          0
loan_amount_term     0
credit_history       0
property_area        0
loan_status          0
dtype: int64

In [353]:
data_no_mv.describe(include='all')

Unnamed: 0,married,dependents,education,self_employed,applicant_income,coapplicantincome,loan_amount,loan_amount_term,credit_history,property_area,loan_status
count,599,599.0,599,599,599.0,599.0,599.0,599.0,599.0,599,599
unique,2,4.0,2,2,,,,,,3,2
top,Yes,0.0,Graduate,No,,,,,,Semiurban,Y
freq,390,345.0,469,519,,,,,,227,413
mean,,,,,5408.864775,1634.896361,146.188648,342.070117,0.859766,,
std,,,,,6168.473138,2953.144575,84.954303,64.221516,0.34752,,
min,,,,,150.0,0.0,9.0,12.0,0.0,,
25%,,,,,2853.0,0.0,100.0,360.0,1.0,,
50%,,,,,3775.0,1213.0,128.0,360.0,1.0,,
75%,,,,,5816.5,2304.0,165.5,360.0,1.0,,


In [354]:
sorted(data_no_mv)

['applicant_income',
 'coapplicantincome',
 'credit_history',
 'dependents',
 'education',
 'loan_amount',
 'loan_amount_term',
 'loan_status',
 'married',
 'property_area',
 'self_employed']

##Dealing with outliers

In [355]:
q1, q3 = np.percentile(data_no_mv["applicant_income"], [25,75])
print(q1)
print(q3)
iqr = q3-q1
lower_bound = q1 - (1.5*iqr)
upper_bound = q3 + (1.5*iqr)
print(lower_bound)
print(upper_bound)
print(data_no_mv[data_no_mv["applicant_income"]>10208]["applicant_income"])
mean = data_no_mv["applicant_income"].mean()
print("Mean: ",mean)
data_no_mv["applicant_income"]=data_no_mv[data_no_mv["applicant_income"]>10208]["applicant_income"] = mean
data_no_mv["applicant_income"].max()

2853.0
5816.5
-1592.25
10261.75
9      12841
34     12500
54     11500
67     10750
106    11417
115    14583
119    10408
126    23803
128    10513
130    20166
138    14999
144    11757
146    14866
155    39999
171    51763
183    33846
185    39147
191    12000
199    11000
254    16250
258    14683
271    11146
278    14583
284    20667
308    20233
324    15000
333    63337
369    19730
370    15759
409    81000
424    14880
432    12876
438    10416
443    37719
467    16692
475    16525
478    16667
483    10833
487    18333
493    17263
506    20833
509    13262
525    17500
533    11250
534    18165
561    19484
572    16666
594    16120
604    12000
Name: applicant_income, dtype: int64
Mean:  5408.864774624374


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


5408.864774624374

In [356]:
q1, q3 = np.percentile(data_no_mv["coapplicantincome"], [25,75])
print(q1)
print(q3)
iqr = q3-q1
lower_bound = q1 - (1.5*iqr)
upper_bound = q3 + (1.5*iqr)
print("Lower bound:",lower_bound)
print("Upper bound:",upper_bound)
print(data_no_mv[data_no_mv["coapplicantincome"]>5407]["coapplicantincome"])
mean = data_no_mv["coapplicantincome"].mean()
print("Mean: ",median)
data_no_mv["coapplicantincome"]=data_no_mv["coapplicantincome"].replace(to_replace =[5625, 5654, 5701, 5500, 5624], 
                            value = mean)
print(data_no_mv["coapplicantincome"].max())

0.0
2304.0
Lower bound: -3456.0
Upper bound: 5760.0
9      10968.0
12      8106.0
21      5625.0
38      7210.0
91      5654.0
122     8980.0
135     7750.0
159     5625.0
173     5625.0
177    11300.0
180     7250.0
242     5701.0
253     7101.0
349     6250.0
372     7873.0
402    20000.0
417    20000.0
444     8333.0
502     5624.0
506     6667.0
513     6666.0
523     7166.0
530     5500.0
581    33837.0
600    41667.0
Name: coapplicantincome, dtype: float64
Mean:  1304.8531794309135
41667.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


###Encoding Categorical Data

In [357]:
obj_df=data_no_mv.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,married,dependents,education,self_employed,property_area,loan_status
0,No,0,Graduate,No,Urban,Y
1,Yes,1,Graduate,No,Rural,N
2,Yes,0,Graduate,Yes,Urban,Y
3,Yes,0,Not Graduate,No,Urban,Y
4,No,0,Graduate,No,Urban,Y


In [358]:
obj_df["married"].value_counts()

Yes    390
No     209
Name: married, dtype: int64

In [359]:
obj_df["dependents"].value_counts()

0     345
1     102
2     101
3+     51
Name: dependents, dtype: int64

In [360]:
obj_df["education"].value_counts()

Graduate        469
Not Graduate    130
Name: education, dtype: int64

In [361]:
obj_df["self_employed"].value_counts()

No     519
Yes     80
Name: self_employed, dtype: int64

In [362]:
obj_df["property_area"].value_counts()

Semiurban    227
Urban        195
Rural        177
Name: property_area, dtype: int64

In [363]:
obj_df["loan_status"].value_counts()

Y    413
N    186
Name: loan_status, dtype: int64

In [364]:
cleanup_nums = {"married":     {"Yes": 1, "No": 0},
                "dependents": {"0": 0, "1": 1, "2": 2, "3+": 3},
                "education" : {"Graduate": 1, "Not Graduate" : 0},
                "self_employed": {"Yes" : 1, "No" : 0},
                "property_area": {"Rural" : 0, "Semiurban" :1, "Urban":2},
                "loan_status": {"N" : 0, "Y":1}
               }

In [365]:
obj_df = obj_df.replace(cleanup_nums)
obj_df.head()

Unnamed: 0,married,dependents,education,self_employed,property_area,loan_status
0,0,0,1,0,2,1
1,1,1,1,0,0,0
2,1,0,1,1,2,1
3,1,0,0,0,2,1
4,0,0,1,0,2,1


In [366]:
data_no_mv[['married', 'dependents', 'education', 'self_employed', 'property_area', 'loan_status']]=obj_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [367]:
data_no_mv.head()

Unnamed: 0,married,dependents,education,self_employed,applicant_income,coapplicantincome,loan_amount,loan_amount_term,credit_history,property_area,loan_status
0,0,0,1,0,5408.864775,0.0,128.0,360.0,1.0,2,1
1,1,1,1,0,5408.864775,1508.0,128.0,360.0,1.0,0,0
2,1,0,1,1,5408.864775,0.0,66.0,360.0,1.0,2,1
3,1,0,0,0,5408.864775,2358.0,120.0,360.0,1.0,2,1
4,0,0,1,0,5408.864775,0.0,141.0,360.0,1.0,2,1


In [368]:
data_no_mv.columns

Index(['married', 'dependents', 'education', 'self_employed',
       'applicant_income', 'coapplicantincome', 'loan_amount',
       'loan_amount_term', 'credit_history', 'property_area', 'loan_status'],
      dtype='object')

In [369]:
X=data_no_mv.iloc[:, :-1].values

In [370]:
y = data_no_mv.iloc[:,-1].values

In [371]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

###Feature Scaling

In [372]:
from sklearn.preprocessing import StandardScaler
X_train[:, 4:8]

array([[5408.86, 1881.00, 167.00, 360.00],
       [5408.86, 0.00, 36.00, 360.00],
       [5408.86, 0.00, 67.00, 360.00],
       ...,
       [5408.86, 0.00, 81.00, 300.00],
       [5408.86, 5000.00, 120.00, 360.00],
       [5408.86, 1459.00, 144.00, 360.00]])

##Training the Model

###Logistic Regression (Logit function)

In [373]:
x = sm.add_constant(X_train)
reg_log = sm.Logit(y_train, x)
results_log=reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.465968
         Iterations 6


In [374]:
results_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,479.0
Model:,Logit,Df Residuals:,469.0
Method:,MLE,Df Model:,9.0
Date:,"Sat, 17 Jul 2021",Pseudo R-squ.:,0.2422
Time:,16:32:04,Log-Likelihood:,-223.2
converged:,True,LL-Null:,-294.53
Covariance Type:,nonrobust,LLR p-value:,2.918e-26

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.5285,0.264,1.999,0.046,0.010,1.047
x2,-0.0051,0.132,-0.038,0.969,-0.264,0.254
x3,0.3851,0.293,1.314,0.189,-0.189,0.959
x4,-0.0740,0.349,-0.212,0.832,-0.759,0.611
x5,-0.0005,0.000,-3.060,0.002,-0.001,-0.000
x6,-5.24e-05,3.91e-05,-1.341,0.180,-0.000,2.42e-05
x7,-7.226e-05,0.002,-0.048,0.962,-0.003,0.003
x8,-0.0010,0.002,-0.509,0.611,-0.005,0.003
x9,3.8948,0.486,8.007,0.000,2.941,4.848


In [375]:
np.set_printoptions(formatter={'float': lambda x: '{0:0.2f}'.format(x)})
results_log.predict()

array([0.84, 0.85, 0.86, 0.77, 0.71, 0.86, 0.73, 0.67, 0.07, 0.83, 0.84,
       0.85, 0.76, 0.89, 0.86, 0.32, 0.78, 0.77, 0.78, 0.84, 0.73, 0.82,
       0.79, 0.76, 0.86, 0.82, 0.86, 0.74, 0.63, 0.82, 0.80, 0.84, 0.83,
       0.78, 0.67, 0.81, 0.88, 0.87, 0.83, 0.82, 0.76, 0.78, 0.78, 0.09,
       0.07, 0.82, 0.75, 0.86, 0.76, 0.88, 0.84, 0.84, 0.83, 0.81, 0.88,
       0.06, 0.81, 0.06, 0.71, 0.78, 0.08, 0.86, 0.82, 0.78, 0.75, 0.78,
       0.78, 0.73, 0.69, 0.83, 0.77, 0.86, 0.82, 0.77, 0.84, 0.74, 0.84,
       0.86, 0.77, 0.75, 0.76, 0.05, 0.75, 0.79, 0.05, 0.67, 0.06, 0.82,
       0.81, 0.82, 0.75, 0.79, 0.81, 0.85, 0.79, 0.77, 0.85, 0.75, 0.84,
       0.06, 0.84, 0.76, 0.78, 0.75, 0.86, 0.78, 0.78, 0.08, 0.83, 0.85,
       0.73, 0.10, 0.07, 0.82, 0.85, 0.86, 0.85, 0.87, 0.83, 0.77, 0.86,
       0.72, 0.06, 0.75, 0.77, 0.06, 0.07, 0.71, 0.76, 0.86, 0.76, 0.77,
       0.86, 0.75, 0.84, 0.77, 0.82, 0.83, 0.09, 0.85, 0.72, 0.85, 0.78,
       0.10, 0.69, 0.64, 0.85, 0.85, 0.78, 0.70, 0.

In [376]:
np.array(data_no_mv['loan_status'])

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,

In [377]:
results_log.pred_table()

array([[62.00, 84.00],
       [5.00, 328.00]])

In [378]:
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0:'Actual 0', 1: 'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,62.0,84.0
Actual 1,5.0,328.0


In [379]:
cm = np.array(cm_df)
accuracy_score = (cm[0,0]+cm[1,1])/cm.sum()
print("Accuracy score: %.2f" %(accuracy_score*100), "%")

Accuracy score: 81.42 %


###Testing the Model

In [380]:
x_test = sm.add_constant(X_test)
reg_log = sm.Logit(y_test, x_test)

In [381]:
def confusion_matrix(data,actual_values,model):
    
        pred_values = model.predict(data)
        bins=np.array([0,0.5,1])
        cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
        accuracy = (cm[0,0]+cm[1,1])/cm.sum()
        return cm, accuracy

In [382]:
cm = confusion_matrix(x_test,y_test,results_log)
cm

(array([[17.00, 23.00],
        [2.00, 78.00]]), 0.7916666666666666)

In [383]:
cm_df = pd.DataFrame(cm[0])
cm_df.columns = ['Predicted 0','Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1:'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,17.0,23.0
Actual 1,2.0,78.0


In [384]:
print ('Missclassification rate: %.2f' %(((25/(17+23+2+78))*100)), "%")

Missclassification rate: 20.83 %


###Model evaluation based on K-fold-cross validation

In [385]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [386]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
a_s=accuracy_score(y_test, y_pred)
a_s*=100
print("Accuracy score: %.2f"%a_s)

Accuracy score: 77.50


In [388]:
accuracies = cross_val_score(estimator = classifier, X=X_train, y=y_train, cv=10)
print("Accuracy  (K-fold-cross validation): {:.2f}%".format(accuracies.mean()*100))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Accuracy  (K-fold-cross validation): 81.21%
