In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [2]:
hung_comparison = pd.read_csv("hung_comparison.csv")

In [3]:
alb_bulg = hung_comparison[hung_comparison['Language'] < 3].copy()

In [4]:
alb_rom = hung_comparison[hung_comparison['Language'] != 2].copy()

In [5]:
bulg_rom = hung_comparison[hung_comparison['Language'] > 1].copy()

In [6]:
def change_lang(float):
    if float == 3:
        return 'Romanian'
    elif float == 2:
        return 'Bulgarian'
    else:
        return 'Albanian'

In [7]:
ab = []
for i in alb_bulg['Language']:
    ab.append(change_lang(i))

In [8]:
alb_bulg['Language Name'] = ab


In [9]:
alb_bulg['Common Char'] = [float(i) for i in alb_bulg['Common Char']]

In [10]:
ar = []
for i in alb_rom['Language']:
    ar.append(change_lang(i))

In [11]:
alb_rom['Language Name'] = ar

In [12]:
br = []
for i in bulg_rom['Language']:
    br.append(change_lang(i))

In [13]:
bulg_rom['Language Name'] = br

In [14]:
alb_rom['Common Char'] = [float(i) for i in alb_rom['Common Char']]
bulg_rom['Common Char'] = [float(i) for i in bulg_rom['Common Char']]

## Alb / Bulg:

In [15]:
y = np.ravel(alb_bulg[['Language Name']])

model = LogisticRegression().fit(alb_bulg[['Common Char']], y)

predict = model.predict(alb_bulg[['Common Char']])

print('slope: {:.7f}'.format(float(model.coef_)))

slope: 0.2841944


In [16]:
X_train, X_test, y_train, y_test = train_test_split(alb_bulg[['Common Char']], alb_bulg[['Language Name']], test_size=0.20, random_state=15)

print('X_test:')
print(X_test[:5])
print('')

print('X_train:')
print(X_train[:5])
print('')

print('y_test:')
print(y_test[:5])
print('')

print('y_train:')
print(y_train[:5])

X_test:
      Common Char
1969          3.0
3118          4.0
722           1.0
249           1.0
2861          0.0

X_train:
      Common Char
3424          0.0
2194          3.0
1015          1.0
701           1.0
252           1.0

y_test:
     Language Name
1969     Bulgarian
3118     Bulgarian
722       Albanian
249       Albanian
2861     Bulgarian

y_train:
     Language Name
3424     Bulgarian
2194     Bulgarian
1015      Albanian
701       Albanian
252       Albanian


In [17]:
ab_test = pd.DataFrame({'Common Char': X_test['Common Char']})


In [18]:
predict_ab1 = model.predict(ab_test[['Common Char']])


In [19]:
predict_ab1 = np.ravel(predict_ab1)
print(predict_ab1)
y_test = np.ravel(y_test)
print(y_test)

accurate = 0

for i in range(len(predict_ab1)):
    if predict_ab1[i] == y_test[i]:
        accurate += 1

print('percent accurate: {:.1f}%'.format((accurate/len(y_test))*100))

['Bulgarian' 'Bulgarian' 'Albanian' 'Albanian' 'Albanian' 'Albanian'
 'Albanian' 'Bulgarian' 'Albanian' 'Bulgarian' 'Albanian' 'Albanian'
 'Bulgarian' 'Bulgarian' 'Albanian' 'Bulgarian' 'Albanian' 'Albanian'
 'Albanian' 'Bulgarian' 'Albanian' 'Albanian' 'Albanian' 'Albanian'
 'Albanian' 'Bulgarian' 'Albanian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Bulgarian'
 'Albanian' 'Albanian' 'Albanian' 'Bulgarian' 'Bulgarian' 'Albanian'
 'Bulgarian' 'Albanian' 'Albanian' 'Albanian' 'Bulgarian' 'Bulgarian'
 'Albanian' 'Bulgarian' 'Albanian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Bulgarian' 'Bulgarian'
 'Albanian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Albanian' 'Bulgarian'
 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Bulgarian'
 'Bulgarian' 'Albanian' 'Bulgarian' 'Albanian' 'Albanian' 'Albanian'
 'Albanian' 'Bulgarian' 'Albanian' 'Albanian' 'Albanian' 'Albanian'
 'Bulgarian' 'Albanian' 'Bulga

In [20]:
train_df = pd.DataFrame({'Common Char': X_train['Common Char']})

predict_two = model.predict(train_df[['Common Char']])

predict_two = np.ravel(predict_two)
print(predict_two)
y_train = np.ravel(y_train)
print(y_train)

accurate2 = 0

for i in range(len(predict_two)):
    if predict_two[i] == y_train[i]:
        accurate2 += 1

print('percent accurate: {:.1f}%'.format((accurate2/len(y_train))*100))

['Albanian' 'Bulgarian' 'Albanian' ... 'Albanian' 'Albanian' 'Albanian']
['Bulgarian' 'Bulgarian' 'Albanian' ... 'Bulgarian' 'Bulgarian'
 'Bulgarian']
percent accurate: 56.4%


## Alb / Rom:

In [21]:
y = np.ravel(alb_rom[['Language Name']])

model = LogisticRegression().fit(alb_rom[['Common Char']], y)

predict = model.predict(alb_rom[['Common Char']])

print('slope: {:.7f}'.format(float(model.coef_)))

slope: 0.1816676


In [22]:
X_train, X_test, y_train, y_test = train_test_split(alb_rom[['Common Char']], alb_rom[['Language Name']], test_size=0.20, random_state=15)

print('X_test:')
print(X_test[:5])
print('')

print('X_train:')
print(X_train[:5])
print('')

print('y_test:')
print(y_test[:5])
print('')

print('y_train:')
print(y_train[:5])

X_test:
      Common Char
854           1.0
1624          0.0
446           2.0
1277          2.0
843           1.0

X_train:
      Common Char
4185          0.0
3798          6.0
5131          5.0
4108          1.0
4             1.0

y_test:
     Language Name
854       Albanian
1624      Albanian
446       Albanian
1277      Albanian
843       Albanian

y_train:
     Language Name
4185      Romanian
3798      Romanian
5131      Romanian
4108      Romanian
4         Albanian


In [23]:
train_df = pd.DataFrame({'Common Char': X_train['Common Char']})

predict_two = model.predict(train_df[['Common Char']])

predict_two = np.ravel(predict_two)
print(predict_two)
y_train = np.ravel(y_train)
print(y_train)

accurate2 = 0

for i in range(len(predict_two)):
    if predict_two[i] == y_train[i]:
        accurate2 += 1

print('percent accurate: {:.1f}%'.format((accurate2/len(y_train))*100))

['Albanian' 'Romanian' 'Romanian' ... 'Albanian' 'Albanian' 'Romanian']
['Romanian' 'Romanian' 'Romanian' ... 'Romanian' 'Romanian' 'Romanian']
percent accurate: 54.1%


In [24]:
ar_test = pd.DataFrame({'Common Char': X_test['Common Char']})


predict_ar1 = model.predict(ar_test[['Common Char']])


predict_ar1 = np.ravel(predict_ar1)
print(predict_ar1)
y_test = np.ravel(y_test)
print(y_test)

accurate = 0

for i in range(len(predict_ar1)):
    if predict_ar1[i] == y_test[i]:
        accurate += 1

print('percent accurate: {:.1f}%'.format((accurate/len(y_test))*100))

['Albanian' 'Albanian' 'Romanian' 'Romanian' 'Albanian' 'Albanian'
 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian'
 'Romanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian'
 'Albanian' 'Albanian' 'Albanian' 'Romanian' 'Romanian' 'Albanian'
 'Romanian' 'Albanian' 'Romanian' 'Romanian' 'Romanian' 'Albanian'
 'Romanian' 'Romanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian'
 'Albanian' 'Albanian' 'Albanian' 'Romanian' 'Romanian' 'Albanian'
 'Albanian' 'Romanian' 'Romanian' 'Albanian' 'Romanian' 'Albanian'
 'Albanian' 'Romanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian'
 'Albanian' 'Romanian' 'Romanian' 'Albanian' 'Albanian' 'Albanian'
 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Romanian'
 'Romanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Romanian'
 'Romanian' 'Albanian' 'Albanian' 'Albanian' 'Albanian' 'Romanian'
 'Albanian' 'Albanian' 'Albanian' 'Romanian' 'Albanian' 'Albanian'
 'Romanian' 'Albanian' 'Romanian' 'Albanian' 'Albanian' 'Alban

## Bulg / Rom:

In [25]:
y = np.ravel(bulg_rom[['Language Name']])

model = LogisticRegression().fit(bulg_rom[['Common Char']], y)

predict = model.predict(bulg_rom[['Common Char']])

print('slope: {:.7f}'.format(float(model.coef_)))

slope: -0.0999854


In [26]:
X_train, X_test, y_train, y_test = train_test_split(bulg_rom[['Common Char']], bulg_rom[['Language Name']], test_size=0.20, random_state=15)

print('X_test:')
print(X_test[:5])
print('')

print('X_train:')
print(X_train[:5])
print('')

print('y_test:')
print(y_test[:5])
print('')

print('y_train:')
print(y_train[:5])

X_test:
      Common Char
2590          1.0
4871          2.0
5137          0.0
4747          0.0
3405          1.0

X_train:
      Common Char
4224          2.0
4162          1.0
4872          0.0
3360          2.0
2367          2.0

y_test:
     Language Name
2590     Bulgarian
4871      Romanian
5137      Romanian
4747      Romanian
3405     Bulgarian

y_train:
     Language Name
4224      Romanian
4162      Romanian
4872      Romanian
3360     Bulgarian
2367     Bulgarian


In [27]:
br_test = pd.DataFrame({'Common Char': X_test['Common Char']})


predict_br1 = model.predict(br_test[['Common Char']])


predict_br1 = np.ravel(predict_br1)
print(predict_br1)
y_test = np.ravel(y_test)
print(y_test)

accurate = 0

for i in range(len(predict_br1)):
    if predict_br1[i] == y_test[i]:
        accurate += 1

print('percent accurate: {:.1f}%'.format((accurate/len(y_test))*100))

['Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'B

In [28]:
train_df = pd.DataFrame({'Common Char': X_train['Common Char']})

predict_two = model.predict(train_df[['Common Char']])

predict_two = np.ravel(predict_two)
print(predict_two)
y_train = np.ravel(y_train)
print(y_train)

accurate2 = 0

for i in range(len(predict_two)):
    if predict_two[i] == y_train[i]:
        accurate2 += 1

print('percent accurate: {:.1f}%'.format((accurate2/len(y_train))*100))

['Bulgarian' 'Bulgarian' 'Bulgarian' ... 'Bulgarian' 'Bulgarian'
 'Bulgarian']
['Romanian' 'Romanian' 'Romanian' ... 'Romanian' 'Romanian' 'Romanian']
percent accurate: 53.5%


In [29]:
y = np.ravel(bulg_rom[['Language Name']])

model = LogisticRegression().fit(bulg_rom[['Vowel Percent']], y)

predict = model.predict(bulg_rom[['Vowel Percent']])

print('slope: {:.7f}'.format(float(model.coef_)))

slope: 4.9531978


In [30]:
X_train, X_test, y_train, y_test = train_test_split(bulg_rom[['Vowel Percent']], bulg_rom[['Language Name']], test_size=0.20, random_state=15)

print('X_test:')
print(X_test[:5])
print('')

print('X_train:')
print(X_train[:5])
print('')

print('y_test:')
print(y_test[:5])
print('')

print('y_train:')
print(y_train[:5])

X_test:
      Vowel Percent
2590       0.375000
4871       0.500000
5137       0.400000
4747       0.400000
3405       0.333333

X_train:
      Vowel Percent
4224       0.600000
4162       0.500000
4872       0.750000
3360       0.250000
2367       0.363636

y_test:
     Language Name
2590     Bulgarian
4871      Romanian
5137      Romanian
4747      Romanian
3405     Bulgarian

y_train:
     Language Name
4224      Romanian
4162      Romanian
4872      Romanian
3360     Bulgarian
2367     Bulgarian


In [31]:
br_test = pd.DataFrame({'Vowel Percent': X_test['Vowel Percent']})


predict_br1 = model.predict(br_test[['Vowel Percent']])


predict_br1 = np.ravel(predict_br1)
print(predict_br1)
y_test = np.ravel(y_test)
print(y_test)

accurate = 0

for i in range(len(predict_br1)):
    if predict_br1[i] == y_test[i]:
        accurate += 1

print('percent accurate: {:.1f}%'.format((accurate/len(y_test))*100))

['Bulgarian' 'Romanian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Romanian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Romanian' 'Romanian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Romanian' 'Bulgarian' 'Romanian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Romanian' 'Bulgarian' 'Romanian' 'Romanian'
 'Bulgarian' 'Romanian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Romanian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Romanian' 'Romanian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Romanian'
 'Bulgarian' 'Romanian' 'Bulgarian' 'Bulgarian' 'Romanian' 'Bulgarian'
 'Romanian' 'Romanian' 'Romanian' 'Romanian' 'Romanian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Romanian'
 'Bulgarian' 'Bulgarian' 'Romanian' 'Bulgarian' 'Bulgarian' 'Bulgarian'
 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Bulgarian' 'Romanian'
 '

In [32]:
y = np.ravel(alb_bulg[['Language Name']])

model = LogisticRegression().fit(alb_bulg[['Vowel Percent']], y)

predict = model.predict(alb_bulg[['Vowel Percent']])

print('slope: {:.7f}'.format(float(model.coef_)))

slope: 1.6006929


In [33]:
X_train, X_test, y_train, y_test = train_test_split(alb_bulg[['Vowel Percent']], alb_bulg[['Language Name']], test_size=0.20, random_state=15)

print('X_test:')
print(X_test[:5])
print('')

print('X_train:')
print(X_train[:5])
print('')

print('y_test:')
print(y_test[:5])
print('')

print('y_train:')
print(y_train[:5])

X_test:
      Vowel Percent
1969       0.428571
3118       0.333333
722        0.400000
249        0.200000
2861       0.400000

X_train:
      Vowel Percent
3424       0.444444
2194       0.333333
1015       0.250000
701        0.400000
252        0.333333

y_test:
     Language Name
1969     Bulgarian
3118     Bulgarian
722       Albanian
249       Albanian
2861     Bulgarian

y_train:
     Language Name
3424     Bulgarian
2194     Bulgarian
1015      Albanian
701       Albanian
252       Albanian


# Multivariate:

### Four variables:

    - Word length
    - Cons percent
    - Common char
    
    
https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

In [35]:
alb_bulg = alb_bulg.drop(columns=['Cons Percent'])
alb_bulg = alb_bulg.drop(columns=['Length'])

In [36]:
cat_vars=['Cons', 'Vowel','Common Char']
for var in cat_vars:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(alb_bulg[var], prefix=var)
    data1=alb_bulg.join(cat_list)
    alb_bulg=data1

data_vars=alb_bulg.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]

In [37]:
data_final=alb_bulg[to_keep]


In [38]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [39]:
from imblearn.over_sampling import SMOTE


In [40]:
data_final

Unnamed: 0.1,Unnamed: 0,Word,Language,Vowel Percent,More Three,Language Name,Cons_0,Cons_1,Cons_2,Cons_3,...,Common Char_0.0,Common Char_1.0,Common Char_2.0,Common Char_3.0,Common Char_4.0,Common Char_5.0,Common Char_6.0,Common Char_7.0,Common Char_8.0,Common Char_10.0
0,0,bot,1.0,0.333333,No,Albanian,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,1,e,1.0,1.000000,No,Albanian,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2,rokulialem,1.0,0.500000,Yes,Albanian,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,3,dua,1.0,0.666667,No,Albanian,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,4,tok,1.0,0.333333,No,Albanian,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3635,3635,proklinam,2.0,0.333333,Yes,Bulgarian,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3636,3636,kalna,2.0,0.400000,Yes,Bulgarian,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3637,3637,postja,2.0,0.333333,No,Bulgarian,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3638,3638,nebe,2.0,0.500000,No,Bulgarian,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [41]:
is_alb = []
for i in data_final['Language Name']:
    if i == 'Albanian':
        is_alb.append(1)
    else:
        is_alb.append(0)
    

In [42]:
data_final['y'] = is_alb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final['y'] = is_alb


In [43]:
data_final = data_final.drop(columns=['Vowel Percent','Unnamed: 0', 'Word', 'More Three', 'Language Name', 'Language'])

In [44]:
data_final.columns

Index(['Cons_0', 'Cons_1', 'Cons_2', 'Cons_3', 'Cons_4', 'Cons_5', 'Cons_6',
       'Cons_7', 'Cons_8', 'Cons_9', 'Cons_10', 'Cons_11', 'Cons_12',
       'Vowel_0', 'Vowel_1', 'Vowel_2', 'Vowel_3', 'Vowel_4', 'Vowel_5',
       'Vowel_6', 'Vowel_7', 'Vowel_9', 'Common Char_0.0', 'Common Char_1.0',
       'Common Char_2.0', 'Common Char_3.0', 'Common Char_4.0',
       'Common Char_5.0', 'Common Char_6.0', 'Common Char_7.0',
       'Common Char_8.0', 'Common Char_10.0', 'y'],
      dtype='object')

In [45]:
data_final['y'].value_counts()

0    1865
1    1775
Name: y, dtype: int64

In [46]:
X = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y']

from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=15)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)



In [47]:
columns = X_train.columns

In [48]:
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])

# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

length of oversampled data is  2636
Number of no subscription in oversampled data 1318
Number of subscription 1318
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [49]:
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix

In [51]:
alb_bulg.columns

Index(['Unnamed: 0', 'Word', 'Vowel', 'Cons', 'Language', 'Common Char',
       'Vowel Percent', 'More Three', 'Language Name', 'Cons_0', 'Cons_1',
       'Cons_2', 'Cons_3', 'Cons_4', 'Cons_5', 'Cons_6', 'Cons_7', 'Cons_8',
       'Cons_9', 'Cons_10', 'Cons_11', 'Cons_12', 'Vowel_0', 'Vowel_1',
       'Vowel_2', 'Vowel_3', 'Vowel_4', 'Vowel_5', 'Vowel_6', 'Vowel_7',
       'Vowel_9', 'Common Char_0.0', 'Common Char_1.0', 'Common Char_2.0',
       'Common Char_3.0', 'Common Char_4.0', 'Common Char_5.0',
       'Common Char_6.0', 'Common Char_7.0', 'Common Char_8.0',
       'Common Char_10.0'],
      dtype='object')

In [52]:
alb_bulg = alb_bulg.drop(columns=['Vowel Percent','Unnamed: 0', 'Word', 'More Three', 'Language Name',  'Common Char', 'Common Char_0.0', 'Common Char_1.0', 'Common Char_2.0',
       'Common Char_3.0', 'Common Char_4.0', 'Common Char_5.0',
       'Common Char_6.0', 'Common Char_7.0', 'Common Char_8.0',
       'Common Char_10.0'])

In [53]:
alb_bulg

Unnamed: 0,Vowel,Cons,Language,Cons_0,Cons_1,Cons_2,Cons_3,Cons_4,Cons_5,Cons_6,...,Cons_12,Vowel_0,Vowel_1,Vowel_2,Vowel_3,Vowel_4,Vowel_5,Vowel_6,Vowel_7,Vowel_9
0,1,2,1.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,0,1.0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,5,5,1.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,2,1,1.0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,2,1.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3635,3,6,2.0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3636,2,3,2.0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3637,2,4,2.0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3638,2,2,2.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [55]:
array = []
for i in alb_bulg['Language']:
    if i == 1.0:
        array.append(1)
    elif i == 2.0:
        array.append(0)

In [56]:
alb_bulg['y'] = array

In [58]:
alb_bulg = alb_bulg.drop(columns=[ 'Language', 'Vowel', 'Cons'])

In [60]:
data_final = alb_bulg.copy()

In [62]:
data_final['y'].value_counts()

X = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y']

from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=15)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)



columns = X_train.columns

os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])

# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

length of oversampled data is  3002
Number of no subscription in oversampled data 1501
Number of subscription 1501
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5
         Current function value: 0.576294
         Iterations: 35


LinAlgError: Singular matrix