In [1]:
from scipy.io import arff
import pandas as pd

In [2]:
# loadarff returns a (records, metadata) pair; unpack it instead of indexing
# with [0], and avoid the generic name `data`, which a later cell rebinds to
# a filtered DataFrame.
arff_records, arff_meta = arff.loadarff('4year.arff')
df = pd.DataFrame(arff_records)

In [3]:
# The ARFF class label is loaded as bytes; flag rows whose label is b'1'.
df['Bankruptcy'] = df['class'].eq(b'1')

In [4]:
# Preview the first five rows of the raw frame.
df.iloc[:5]

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class,Bankruptcy
0,0.15929,0.4624,0.07773,1.1683,-44.853,0.46702,0.18948,0.82895,1.1223,0.3833,...,0.41557,0.89101,0.001422,7.7928,4.9914,119.81,3.0465,3.056,b'0',False
1,-0.12743,0.46243,0.26917,1.7517,7.597,0.000925,-0.12743,1.1625,1.2944,0.53757,...,-0.23704,1.0625,0.15041,5.4327,3.4629,100.97,3.615,3.4725,b'0',False
2,0.070488,0.2357,0.52781,3.2393,125.68,0.16367,0.086895,2.8718,1.0574,0.67689,...,0.10413,0.94571,0.0,7.107,3.3808,76.076,4.7978,4.7818,b'0',False
3,0.13676,0.40538,0.31543,1.8705,19.115,0.50497,0.13676,1.4539,1.1144,0.58938,...,0.23203,0.89737,0.073024,6.1384,4.2241,88.299,4.1337,4.6484,b'0',False
4,-0.11008,0.69793,0.18878,1.2713,-15.344,0.0,-0.11008,0.43282,1.735,0.30207,...,-0.3644,0.57153,0.0,18.801,2.7925,146.39,2.4934,15.036,b'0',False


In [5]:
# Number of columns, including the derived Bankruptcy flag.
len(df.columns)

66

# Choose four features

In [6]:
# Select the four features and the label by name rather than by position, so
# the selection does not silently change if the column order does. .copy()
# makes df1 an independent frame that is safe to rename or modify.
df1 = df[['Attr1', 'Attr2', 'Attr7', 'Attr10', 'Bankruptcy']].copy()

# Rename columns

In [7]:
# Short names for the selected attributes; order matches the selection above.
short_names = ['x1', 'x2', 'x7', 'x10', 'Bankruptcy']
df1.columns = short_names

In [8]:
# Preview the first five rows of the reduced frame.
df1.iloc[:5]

Unnamed: 0,x1,x2,x7,x10,Bankruptcy
0,0.15929,0.4624,0.18948,0.3833,False
1,-0.12743,0.46243,-0.12743,0.53757,False
2,0.070488,0.2357,0.086895,0.67689,False
3,0.13676,0.40538,0.13676,0.58938,False
4,-0.11008,0.69793,-0.11008,0.30207,False


In [9]:
# Per column: does it contain at least one missing value?
df1.isna().any(axis=0)

x1             True
x2             True
x7             True
x10            True
Bankruptcy    False
dtype: bool

# Fill in missing values with mean

In [10]:
# Impute with pandas instead of sklearn.preprocessing.Imputer, which current
# scikit-learn releases no longer provide. Only the four feature columns are
# filled (each NaN becomes that column's mean); the Bankruptcy label is left
# untouched instead of being pushed through the imputer.
feature_cols = ['x1', 'x2', 'x7', 'x10']
newdf = df1.copy()
newdf[feature_cols] = newdf[feature_cols].fillna(newdf[feature_cols].mean())

In [11]:
# Make sure the label column is boolean (True where the stored value equals 1).
newdf['Bankruptcy'] = newdf['Bankruptcy'].eq(1)

In [12]:
# Preview the first five rows after imputation.
newdf.iloc[:5]

Unnamed: 0,x1,x2,x7,x10,Bankruptcy
0,0.15929,0.4624,0.18948,0.3833,False
1,-0.12743,0.46243,-0.12743,0.53757,False
2,0.070488,0.2357,0.086895,0.67689,False
3,0.13676,0.40538,0.13676,0.58938,False
4,-0.11008,0.69793,-0.11008,0.30207,False


In [13]:
# Confirm that no column still contains a missing value.
newdf.isna().any(axis=0)

x1            False
x2            False
x7            False
x10           False
Bankruptcy    False
dtype: bool

# Find the bankrupt group

In [14]:
# Rows whose Bankruptcy flag is set.
bankrupt = newdf.loc[newdf['Bankruptcy'].eq(1)]

In [15]:
# Preview the first five bankrupt companies.
bankrupt.iloc[:5]

Unnamed: 0,x1,x2,x7,x10,Bankruptcy
9277,0.2115,1.334,0.2115,-0.33423,True
9278,-0.12959,0.9963,-0.12959,0.003893,True
9279,0.064481,0.64146,0.071695,0.35854,True
9280,-0.081825,1.1005,-0.081825,-0.10048,True
9281,0.10218,0.42492,0.12692,0.57498,True


# Find the still-operating group

In [16]:
# Rows whose Bankruptcy flag is not set.
still_operating = newdf.loc[newdf['Bankruptcy'].eq(0)]

In [17]:
# Preview the first five still-operating companies.
still_operating.iloc[:5]

Unnamed: 0,x1,x2,x7,x10,Bankruptcy
0,0.15929,0.4624,0.18948,0.3833,False
1,-0.12743,0.46243,-0.12743,0.53757,False
2,0.070488,0.2357,0.086895,0.67689,False
3,0.13676,0.40538,0.13676,0.58938,False
4,-0.11008,0.69793,-0.11008,0.30207,False


# Mean and std of the four features across all companies

In [18]:
# Summary statistics over every company (quartiles spelled out explicitly).
newdf.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,x1,x2,x7,x10
count,9792.0,9792.0,9792.0,9792.0
mean,0.043019,0.596404,0.059446,0.38904
std,0.359303,4.586887,0.533317,4.590064
min,-12.458,0.0,-12.458,-445.91
25%,0.001322,0.263163,0.003005,0.29451
50%,0.041374,0.46777,0.04884,0.510435
75%,0.111125,0.689183,0.12693,0.714285
max,20.482,446.91,38.618,12.602


# Mean and std of the four features among bankrupt companies

In [19]:
# Summary statistics restricted to the bankrupt group.
bankrupt.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,x1,x2,x7,x10
count,515.0,515.0,515.0,515.0
mean,-0.068873,0.878355,-0.061538,0.103367
std,0.568076,1.945596,0.568432,1.946747
min,-4.0506,0.000357,-4.0506,-39.156
25%,-0.14551,0.458885,-0.14259,0.1056
50%,0.0,0.67103,0.00049,0.30583
75%,0.04464,0.88285,0.050749,0.51656
max,9.8037,40.157,9.8037,0.99945


# Mean and std of the four features among still-operating companies

In [20]:
# Summary statistics restricted to the still-operating group.
still_operating.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,x1,x2,x7,x10
count,9277.0,9277.0,9277.0,9277.0
mean,0.049231,0.580752,0.066162,0.404899
std,0.343002,4.689694,0.530524,4.692934
min,-12.458,0.0,-12.458,-445.91
25%,0.003128,0.25722,0.004711,0.3064
50%,0.043545,0.4575,0.051804,0.51967
75%,0.11314,0.67485,0.12939,0.72139
max,20.482,446.91,38.618,12.602


# Choose companies whose x1 and x10 are both more than one standard deviation below the overall mean

In [21]:
import numpy as np

In [22]:
 # Overall mean of x1 across all companies.
 m1 = newdf['x1'].mean()

In [23]:
# Overall mean of x10 across all companies.
m10 = newdf['x10'].mean()

In [24]:
# Standard deviation of x1 with ddof=0, i.e. the same default np.std applies.
s1 = newdf['x1'].std(ddof=0)

In [25]:
# Standard deviation of x10 with ddof=0, i.e. the same default np.std applies.
s10 = newdf['x10'].std(ddof=0)

In [26]:
# Keep companies whose x1 lies more than one standard deviation below the mean.
x1_cutoff = m1 - s1
data = newdf.loc[newdf['x1'] < x1_cutoff]

In [27]:
# Of those, keep companies whose x10 is also more than one standard deviation
# below the overall x10 mean.
x10_cutoff = m10 - s10
data = data.loc[data['x10'] < x10_cutoff]

In [28]:
# Summary statistics for the companies that pass both thresholds.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,x1,x2,x7,x10
count,15.0,15.0,15.0,15.0
mean,-2.462179,12.950687,-2.456799,-11.977953
std,2.486229,9.630698,2.483392,9.63142
min,-9.2981,5.2632,-9.2981,-39.156
25%,-3.5038,6.32465,-3.46345,-14.753
50%,-1.3743,9.6992,-1.3743,-8.6992
75%,-0.7272,15.753,-0.7272,-5.32465
max,-0.32841,40.157,-0.32841,-4.2632


## 15 companies satisfy the given condition, and 20% of them (3 of 15) went bankrupt

In [29]:
# End the cell with the frame itself so the notebook renders it as a table
# instead of printing its plain-text repr.
data

           x1       x2       x7      x10  Bankruptcy
2312 -1.09270   5.6368 -1.09270  -4.6368       False
2608 -3.72310  11.5300 -3.64240 -10.5300       False
3017 -1.94800  25.0050 -1.94800 -24.0050       False
3739 -0.72685   6.9334 -0.72685  -5.9334       False
4767 -5.96550   6.6818 -5.96550  -5.6818       False
5001 -3.28450  20.4030 -3.28450 -19.4030       False
5259 -0.44000  16.4870 -0.44000 -15.4870       False
5859 -0.32841   6.1187 -0.32841  -5.1187       False
6264 -0.72755   5.2632 -0.72755  -4.2632       False
7846 -1.98410  13.0630 -1.98410 -12.4730       False
8405 -9.29810   9.6992 -9.29810  -8.6992       False
8535 -1.37430   5.7326 -1.37430  -4.7326       False
9584 -4.05060   6.5306 -4.05060  -5.5306        True
9587 -0.65997  40.1570 -0.65997 -39.1560        True
9662 -1.32900  15.0190 -1.32900 -14.0190        True


In [33]:
# Count the selected companies whose Bankruptcy flag is set.
int((data['Bankruptcy'] == 1).sum())

3