In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour

import category_encoders as ce

plt.style.use('seaborn-colorblind')
%matplotlib inline

In [2]:
use_cols = [
    'age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race',
    'sex','capital-gain','capital-loss','hours-per-week','native-country','makes_over_50K_a_year'
]

df = pd.read_csv('data/adult.data', names = use_cols, sep=', ')
df.to_csv("feature_engineering.csv", sep=',')
print(df.shape)
df.head(5)

  


(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,makes_over_50K_a_year
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Проверим пропущенные значения

In [3]:
def check_missing(df,output_path=None):    
    result = pd.concat([df.isnull().sum(),df.isnull().mean()],axis=1)
    result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
    if output_path is not None:
        result.to_csv(output_path+'missing.csv')
        print(output_path, 'missing.csv')
    return result

In [4]:
check_missing(df=df)

Unnamed: 0,total missing,proportion
age,0,0.0
workclass,0,0.0
fnlwgt,0,0.0
education,0,0.0
education-num,0,0.0
marital-status,0,0.0
occupation,0,0.0
relationship,0,0.0
race,0,0.0
sex,0,0.0


упс

## Выбросы

In [5]:
def outlier_detect_arbitrary(df,col,upper_fence,lower_fence):
    para = (upper_fence, lower_fence)
    tmp = pd.concat([df[col]>upper_fence,df[col]<lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))    
    return outlier_index, para

In [6]:
index,para = outlier_detect_arbitrary(df=df,col='age',upper_fence=100,lower_fence=5)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

KeyError: 1

In [None]:
not_null_columns = []
for j in df.select_dtypes(exclude=['object']).columns.values:
    not_null_columns.append(j)
print(not_null_columns)

In [None]:
for j in not_null_columns:
    plt.figure()
    plt.title(j)
    sns.boxplot(data=df[j])

In [None]:
for j in df:
    uniq = np.unique(df[j].values)
    print(j+": "+str(len(uniq))+"\n"+str(uniq)+"\n")

In [None]:
df.drop('fnlwgt',axis = 1)

In [None]:
to_review = []
to_review.append(["age", 17]) 
to_review.append(["occupation", "?"]) 
to_review.append(["workclass", "?"])
to_review.append(["capital-gain", 0])
to_review.append(["capital-gain", 99999])
to_review.append(["capital-loss", 0])
to_review.append(["native-country", "?"])

for j in to_review:
    print(j, (df[j[0]] == j[1]).sum())
df.shape