In [1]:
from scipy.io import arff
import pandas as pd

## 0. Preprocessing

In [2]:
# Preprocessing: read the ARFF file, wrap its records in a DataFrame, and add a
# boolean `bankruptcy` column that is True where the class label equals b'1'.
data = arff.loadarff('4year.arff')
df = pd.DataFrame(data[0]).assign(
    bankruptcy=lambda frame: frame['class'].eq(b'1')
)

In [3]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class,bankruptcy
0,0.15929,0.4624,0.07773,1.1683,-44.853,0.46702,0.18948,0.82895,1.1223,0.3833,...,0.41557,0.89101,0.001422,7.7928,4.9914,119.81,3.0465,3.056,b'0',False
1,-0.12743,0.46243,0.26917,1.7517,7.597,0.000925,-0.12743,1.1625,1.2944,0.53757,...,-0.23704,1.0625,0.15041,5.4327,3.4629,100.97,3.615,3.4725,b'0',False
2,0.070488,0.2357,0.52781,3.2393,125.68,0.16367,0.086895,2.8718,1.0574,0.67689,...,0.10413,0.94571,0.0,7.107,3.3808,76.076,4.7978,4.7818,b'0',False
3,0.13676,0.40538,0.31543,1.8705,19.115,0.50497,0.13676,1.4539,1.1144,0.58938,...,0.23203,0.89737,0.073024,6.1384,4.2241,88.299,4.1337,4.6484,b'0',False
4,-0.11008,0.69793,0.18878,1.2713,-15.344,0.0,-0.11008,0.43282,1.735,0.30207,...,-0.3644,0.57153,0.0,18.801,2.7925,146.39,2.4934,15.036,b'0',False


## 1. Create a new dataframe with only 4 features (and Bankruptcy). Properly rename the columns to X1, X2, X7, and X10

In [4]:
# Source column -> new name. Dict order fixes the column order of the result.
column_names = {
    'Attr1': 'X1',
    'Attr2': 'X2',
    'Attr7': 'X7',
    'Attr10': 'X10',
    'bankruptcy': 'Bankruptcy',
}
# Select just those columns and rename them in a single step.
features_df = df[list(column_names)].rename(columns=column_names)

In [5]:
# Show the first 10 rows of the reduced feature frame.
features_df.head(10)

Unnamed: 0,X1,X2,X7,X10,Bankruptcy
0,0.15929,0.4624,0.18948,0.3833,False
1,-0.12743,0.46243,-0.12743,0.53757,False
2,0.070488,0.2357,0.086895,0.67689,False
3,0.13676,0.40538,0.13676,0.58938,False
4,-0.11008,0.69793,-0.11008,0.30207,False
5,0.021539,0.58425,0.029628,0.41575,False
6,0.22743,0.52266,0.283,0.47734,False
7,0.038662,0.59498,0.038662,0.40502,False
8,0.13103,0.47202,0.16378,0.52798,False
9,0.17698,0.19359,0.21281,0.80641,False


## 2. Fill in the missing values (NaN) with the column mean.

In [6]:
# Impute every missing entry with the mean of its own column, then preview.
column_means = features_df.mean()
features_df = features_df.fillna(column_means)
features_df.head(10)

Unnamed: 0,X1,X2,X7,X10,Bankruptcy
0,0.15929,0.4624,0.18948,0.3833,False
1,-0.12743,0.46243,-0.12743,0.53757,False
2,0.070488,0.2357,0.086895,0.67689,False
3,0.13676,0.40538,0.13676,0.58938,False
4,-0.11008,0.69793,-0.11008,0.30207,False
5,0.021539,0.58425,0.029628,0.41575,False
6,0.22743,0.52266,0.283,0.47734,False
7,0.038662,0.59498,0.038662,0.40502,False
8,0.13103,0.47202,0.16378,0.52798,False
9,0.17698,0.19359,0.21281,0.80641,False


## 3. Find the mean and std of the 4 features among all, bankrupt and still-operating companies (3 groups).

### 3.1  All  companies

In [7]:
# Statistics over every column except the last one (the Bankruptcy label).
all_features = features_df.iloc[:, :-1]
mean_1 = all_features.mean()
std_1 = all_features.std()

In [8]:
# Result 1: mean of each feature over all companies
mean_1

X1     0.043019
X2     0.596404
X7     0.059446
X10    0.389040
dtype: float64

In [9]:
# Result 2: standard deviation of each feature over all companies
std_1

X1     0.359303
X2     4.586887
X7     0.533317
X10    4.590064
dtype: float64

### 3.2 Bankrupt companies

In [11]:
# Select the bankrupt companies (rows whose Bankruptcy flag equals True).
bankrupt_mask = features_df['Bankruptcy'].eq(True)
temp = features_df.loc[bankrupt_mask]

In [12]:
# Statistics of the current `temp` subset, excluding its last column.
subset_features = temp.iloc[:, :-1]
mean_2 = subset_features.mean()
std_2 = subset_features.std()

In [13]:
# Result 1: mean of each feature over the bankrupt companies
mean_2

X1    -0.068873
X2     0.878355
X7    -0.061538
X10    0.103367
dtype: float64

In [14]:
# Result 2: standard deviation of each feature over the bankrupt companies
std_2

X1     0.568076
X2     1.945596
X7     0.568432
X10    1.946747
dtype: float64

### 3.3 Still-operating companies

In [15]:
# Select the still-operating companies (rows whose Bankruptcy flag equals False).
operating_mask = features_df['Bankruptcy'].eq(False)
temp = features_df.loc[operating_mask]

In [16]:
# Statistics of the current `temp` subset, excluding its last column.
subset_features = temp.iloc[:, :-1]
mean_3 = subset_features.mean()
std_3 = subset_features.std()

In [17]:
# Result 1: mean of each feature over the still-operating companies
mean_3

X1     0.049231
X2     0.580752
X7     0.066162
X10    0.404899
dtype: float64

In [18]:
# Result 2: standard deviation of each feature over the still-operating companies
std_3

X1     0.343002
X2     4.689694
X7     0.530524
X10    4.692934
dtype: float64

## 4. How many companies satisfy the condition X1 < mean(X1) - std(X1) AND X10 < mean(X10) - std(X10)?

In [19]:
# Re-display the first rows of the feature frame before filtering.
features_df.head()

Unnamed: 0,X1,X2,X7,X10,Bankruptcy
0,0.15929,0.4624,0.18948,0.3833,False
1,-0.12743,0.46243,-0.12743,0.53757,False
2,0.070488,0.2357,0.086895,0.67689,False
3,0.13676,0.40538,0.13676,0.58938,False
4,-0.11008,0.69793,-0.11008,0.30207,False


In [20]:
# Each cut-off is the all-company mean minus one all-company standard deviation.
lower_bounds = mean_1 - std_1
threshold_X1 = lower_bounds['X1']
threshold_X10 = lower_bounds['X10']

# Keep only the companies that fall strictly below both cut-offs.
below_x1 = features_df['X1'].lt(threshold_X1)
below_x10 = features_df['X10'].lt(threshold_X10)
temp = features_df.loc[below_x1 & below_x10]

In [21]:
# Number of rows in the filtered subset.
num = len(temp)
message = 'There are %d companies that satisfy the condition.' % num
print(message)

There are 15 companies that satisfy the condition.


## 5. What is the ratio of bankrupt companies within the sub-group above?

In [22]:
# Count the rows in `temp` whose Bankruptcy flag equals True.
bankrupt_flags = temp['Bankruptcy'].eq(True)
bankrupt_num = bankrupt_flags.sum()
bankrupt_num

3

In [23]:
# Share of bankrupt companies in the subset; the float cast keeps the division
# floating-point, as the original `1.0 *` prefix did.
bankrupt_ratio = bankrupt_num / float(num)
bankrupt_ratio

0.2

In [24]:
# Report the ratio as a percentage with two decimals.
ratio_percent = bankrupt_ratio * 100
print('The ratio of the bankrupted companies among the sub-groups above is %.2f%%.' % ratio_percent)

The ratio of the bankrupted companies among the sub-groups above is 20.00%.
