# EDA

Contents of the Notebook:

Exploratory Data Analysis(EDA):

1. Analysis of the features.
2. Finding any relation or treds considering multiple features.
3. Filling Null Values.
4. Transforming Variable for Model consumption

### Exploratory Data Analysis(EDA)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
def iqr(x):
    return x.quantile(q=0.75) - x.quantile(q=0.25)

## Outliers > 75th , tile + 1.5IQR & < 25th tile - 1.5IQR

def outlier_count(x):
    upper_out = x.quantile(q=0.75) + 1.5 * iqr(x)
    lower_out = x.quantile(q=0.25) - 1.5 * iqr(x)
    
    return len(x[x > upper_out]) + len(x[x < lower_out])
    

In [16]:
data = pd.read_csv(r'D:\EDA\titanic\train.csv')

In [17]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,WikiId,Name_wiki,Age_wiki,Hometown,Boarded,Destination,Lifeboat,Body,Class
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,S,691.0,"Braund, Mr. Owen Harris",22.0,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",,,3.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,90.0,"Cumings, Mrs. Florence Briggs (née Thayer)",35.0,"New York, New York, US",Cherbourg,"New York, New York, US",4,,1.0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,S,865.0,"Heikkinen, Miss Laina",26.0,"Jyväskylä, Finland",Southampton,New York City,14?,,3.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,S,127.0,"Futrelle, Mrs. Lily May (née Peel)",35.0,"Scituate, Massachusetts, US",Southampton,"Scituate, Massachusetts, US",D,,1.0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,S,627.0,"Allen, Mr. William Henry",35.0,"Birmingham, West Midlands, England",Southampton,New York City,,,3.0


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  WikiId       889 non-null    float64
 13  Name_wiki    889 non-null    object 
 14  Age_wiki     887 non-null    float64
 15  Hometown     889 non-null    object 
 16  Boarded      889 non-null    object 
 17  Destination  889 non-null    object 
 18  Lifeboat     345 non-null    object 
 19  Body    

In [32]:
def data_information(df, id_cols):
    
    ## Removing ID columns
    df = df.drop(columns = id_cols)
    
    ## Creating Empty Data Frame
    data_info = pd.DataFrame(np.random.randn(0, 12) * 0 , 
                             columns = ['No. of Observations (Nrow)',
                                        'No. of Numeric Variable',
                                        'No of Factor Variables',
                                        'No. of Categorical Variables',
                                        'No. of Logical Variable',
                                        'No. of Date Variable',
                                        'No of Zero variance Variables (Uniform)',
                                        '% of Variables having complete cases', 
                                        '% of Variables having <= 50% missing cases',
                                        '% of Variables having >50% missing cases',
                                        '% of Variables having >90% missing cases'])
    
    
    ## Data Information
    
    data_info.loc[0, 'No. of Observations (Nrow)'] = df.shape[0]
    data_info.loc[0, 'No. of Variables (Ncol)'] = df.shape[1]
    data_info.loc[0, 'No. of Numerical Variable'] = df._get_numeric_data().shape[1]
    data_info.loc[0, 'No. of Factor Variables'] = df.select_dtypes(include = 'category').shape[1]
    data_info.loc[0, 'No. of Logical Variables'] = df.select_dtypes(include = 'bool').shape[1]
    data_info.loc[0, 'No. of Categorical Variables'] = df.select_dtypes(include = 'object').shape[1]
    data_info.loc[0, 'No. of Date Variable'] = df.select_dtypes(include = 'datetime64').shape[1]
    data_info.loc[0, 'No. of Zero Variance Variables (Uniform)' ] = df.loc[:, df.apply(pd.Series.numique)== 1 ].shape[1]
                                                                                       
    null_per = pd.DataFrame(df.isnull().sum() / df.shape[0])
    null_per.columns = ['null_per'
    
                        
    data_info.loc[0, '% of Variables having complete cases'] = null_per[null_per]=[ 0 ]hape[0]*100
    data_info.loc[0, '% of Variables having <=50% missing cases'] = null_per[null_per.null_per<= 0.50].shape[0]*100
    data_info.loc[0, '% of Variables having >50% missing cases'] = null_per[null.per.null_per > 0.5].shape[0]*100
    data_info.loc[0, '% of Variables having >90% missing cases ] = null_per[null_per.null_per > 0.9].shape[0]*100
                  
                  
     ## Transposing Data to get in consumable format
                  
     data_info = data_info.transpose()
     data_info.columns = ['Value']
     data['Value'] = data_info['Value'].astype(int)
     
    
     return data_info-input-26-154714532f69>", line 36

SyntaxError: invalid syntax (<ipython-input-32-d78ada7b36c4>, line 36)