### Number of Outliers

In [1]:
def number_of_outliers(df):
    """Count IQR-rule outliers in every column of ``df``.

    A value is counted as an outlier when it lies above
    ``Q3 + 1.5 * IQR`` or below ``Q1 - 1.5 * IQR`` of its own column.

    Args:
        df: DataFrame whose columns support ``quantile`` and comparison.

    Returns:
        DataFrame indexed by column name with a single column
        "Number of Outliers", sorted from most to fewest outliers.
    """
    # Compute each quartile once instead of on every use.
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    is_outlier = (df > q3 + 1.5 * iqr) | (df < q1 - 1.5 * iqr)
    counts = pd.DataFrame({"Number of Outliers": is_outlier.sum()})
    return counts.sort_values(by="Number of Outliers", ascending=False)

### Percentage of null values

In [1]:
def perc_null_values(df):
    """Draw a bar chart of the null percentage of each column that has nulls.

    Columns without any null value are left out. Bars are ordered from the
    highest to the lowest percentage and each bar is labelled with its value.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    null_counts = df.isnull().sum()
    null_value_features = list(null_counts[null_counts != 0].index)

    # Single-column frame (column label 0) holding the percentages.
    perc_null = pd.DataFrame(
        (df[null_value_features].isnull().sum() / df.shape[0]) * 100
    )
    ranked = perc_null.sort_values(by=0, ascending=False)

    plt.figure(figsize=(6, 4))
    sns.barplot(x=perc_null.index, y=perc_null[0], order=ranked.index)

    # Write the percentage slightly above each bar, in plotted order.
    for position, pct in enumerate(ranked[0].values):
        plt.text(
            x=position - 0.3,
            y=pct + 0.7,
            s="{:.1f}%".format(pct),
            color="black",
            fontsize=10,
            fontweight="bold",
        )

    plt.xticks(rotation=0)
    plt.title(
        'Percentage of null values in the features having null values',
        fontdict={'fontsize': 14, 'weight': 'bold'},
    )
    plt.xlabel('Features', fontdict={'fontsize': 10, 'weight': 'bold'})
    plt.ylabel('Null value percentage', fontdict={'fontsize': 10, 'weight': 'bold'})
    plt.show()

### Number of unique values

In [2]:
def num_unique_values(df):
    """Tabulate how many distinct values each column of ``df`` holds.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        "Name of column" and "Number of unique values".
    """
    unique_counts = [df[column].nunique() for column in df.columns]
    return pd.DataFrame(
        {"Name of column": df.columns, "Number of unique values": unique_counts}
    )

### Using subplots to make multiple boxplots and distplots

In [None]:
# Build a 2x3 grid of axes: row 0 holds histograms and row 1 holds boxplots,
# with one column each for the Glucose, BloodPressure and BMI columns of `pima`.
fig,axes=plt.subplots(nrows=2,ncols=3,figsize=(15,10))
# Row 0: histograms (kde=False is passed, so only the bars are requested).
# NOTE(review): sns.distplot is reported as deprecated in recent seaborn
# releases — confirm against the installed version before reusing this cell.
a=sns.distplot(pima["Glucose"],kde=False,ax=axes[0,0])
a.set_title("Histogram of Glucose",fontsize=14)
b=sns.distplot(pima["BloodPressure"],kde=False,ax=axes[0,1])
b.set_title("Histogram of BloodPressure",fontsize=14)
c=sns.distplot(pima["BMI"],kde=False,ax=axes[0,2])
c.set_title("Histogram of BMI",fontsize=14)
# Row 1: boxplots of the same three columns, directly below their histograms.
d=sns.boxplot(y="Glucose",data=pima,ax=axes[1,0])
d.set_title("Boxplot of Glucose",fontsize=14)
e=sns.boxplot(y="BloodPressure",data=pima,ax=axes[1,1])
e.set_title("Boxplot of BloodPressure",fontsize=14)
f=sns.boxplot(y="BMI",data=pima,ax=axes[1,2])
f.set_title("Boxplot of BMI",fontsize=14)
# Let matplotlib adjust the spacing between the six subplots.
plt.tight_layout()

### Feature importance plot

In [None]:
# Horizontal bar chart of the model's feature importances, in percent.
# Assumes dt_model is a fitted scikit-learn tree-based estimator — such
# estimators expose the values as `feature_importances_` (plural, trailing
# underscore); the previous `feature_importance` is not that attribute.
k = dt_model.feature_importances_
# One row per training column (column label 0), most important feature first.
x=pd.DataFrame(k*100,index=X_train.columns).sort_values(by=0,ascending=False)
plt.figure(figsize=(12,7))
# Pass x/y by keyword so the call does not depend on positional-argument
# support, which newer seaborn releases restrict.
sns.barplot(x=x[0],y=x.index,palette='rainbow')
plt.ylabel('Feature Name')
plt.xlabel('Feature Importance in %')
plt.title('Feature Importance Plot')
plt.show()

### Heatmap of Confusion matrix

In [None]:
# Draw the confusion matrix of the training predictions as an annotated
# heatmap (integer cell labels via fmt="d", colour bar switched off).
sns.heatmap(confusion_matrix(y_train,y_pred_train1),annot=True,fmt="d",cbar=False,cmap="YlGnBu")
plt.ylabel("Actual")
plt.xlabel("Predicted")

# Alternative presentation: wrap the test-set confusion matrix in a DataFrame
# whose rows are labelled as actual classes and columns as predicted classes.
# Exactly two labels are given per axis, so this assumes a binary target.
df_cm = pd.DataFrame(data=confusion_matrix(Y_test,Y_test_pred),columns = ["Predicted : 0","Predicted : 1"],index=["Actual : 0","Actual : 1"])

### VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Table with one row per column of X: its variance inflation factor and name.
vif = pd.DataFrame()
# variance_inflation_factor is called with the raw value matrix of X and the
# position of the column being evaluated.
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

In [None]:
import statsmodels.formula.api as SM
def vif_cal(input_data):
    """Print the variance inflation factor (VIF) of every column of ``input_data``.

    Each column is regressed in turn on all the remaining columns and its
    VIF is reported as ``1 / (1 - R^2)``, rounded to two decimals.

    Args:
        input_data: DataFrame of predictors (presumably all numeric — TODO
            confirm against callers).

    Returns:
        None. One line per column is printed.
    """
    x_vars=input_data
    xvar_names=input_data.columns
    for i in range(0,xvar_names.shape[0]):
        # The current column is the response; all other columns are regressors.
        y=x_vars[xvar_names[i]] 
        x=x_vars[xvar_names.drop(xvar_names[i])]
        # NOTE(review): the formula string refers to the local names `y` and
        # `x`; presumably the formula engine resolves them from this scope, so
        # do not rename these locals. A column literally named "x" or "y" in
        # input_data may shadow them — verify before using on such data.
        rsq=SM.ols(formula="y~x", data=x_vars).fit().rsquared  
        # rsq == 1 (perfect collinearity) makes the denominator zero.
        vif=round(1/(1-rsq),2)
        print (xvar_names[i], " VIF = " , vif)

### When transforming variables to log

In [None]:
# Replace +inf / -inf entries of "number_of_reviews" with 0 and keep every
# other value unchanged. Presumably the infinities come from taking the log
# of 0 in the preceding transform — confirm against that step.
df["number_of_reviews"] = np.where(df["number_of_reviews"].isin([-np.inf,np.inf]),0,df["number_of_reviews"])

### Linear regression code

In [None]:
## We will put this in a function..

def lin_reg_model(X_features, y,
                  return_coef=False,
                  data=None):
    """Fit a linear regression on selected features and print its scores.

    The selected columns are one-hot encoded with ``pd.get_dummies``, split
    75/25 into train and test sets (``random_state=100``) and fitted with
    ``LinearRegression``. Training R^2, test R^2 and test RMSE are printed.

    Args:
        X_features: Column label(s) to select from the source frame.
        y: Target values aligned with the rows of the source frame.
        return_coef: When true, return the fitted coefficients.
        data: DataFrame to take the features from. Defaults to the
            module-level ``zomato_df`` so existing calls keep working.

    Returns:
        DataFrame with columns "features" and "coefficients" when
        ``return_coef`` is true, otherwise None.
    """
    from sklearn.metrics import mean_squared_error, r2_score

    # Fall back to the notebook's global frame only when none is supplied.
    if data is None:
        data = zomato_df

    # Convert categorical columns to dummy variables.
    X = pd.get_dummies(data[X_features])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

    # Train the model.
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    # Predict on the held-out rows.
    y_test_pred = regression_model.predict(X_test)

    print("training r-square value is ",regression_model.score(X_train, y_train))
    print("test r-square value is ", r2_score(y_test, y_test_pred))
    print("rmse is ",np.sqrt(mean_squared_error(y_test, y_test_pred)))

    if return_coef:
        return pd.DataFrame({"features": X_train.columns,
                             "coefficients": regression_model.coef_})
    return None

### Outlier treatment using IQR technique

In [1]:
def capping_outliers(cont2_df):
    """Cap the outliers of every column at the IQR fences.

    For each column, values above ``Q3 + 1.5 * IQR`` are set to that upper
    fence and values below ``Q1 - 1.5 * IQR`` are set to the lower fence.
    Both fences are computed from the column before any capping is applied.

    Args:
        cont2_df: DataFrame of continuous columns. It is modified in place.

    Returns:
        The same DataFrame object, with capped columns.
    """
    def upper_range(k):
        IQR1 = k.quantile(0.75) - k.quantile(0.25)
        return k.quantile(0.75) + (1.5 * IQR1)

    def lower_range(k):
        IQR1 = k.quantile(0.75) - k.quantile(0.25)
        # Previously the value was stored in `ur` while the undefined name
        # `lr` was returned, which raised NameError on the first call.
        return k.quantile(0.25) - (1.5 * IQR1)

    for i in cont2_df.columns:
        ur = upper_range(cont2_df[i])
        lr = lower_range(cont2_df[i])
        cont2_df[i] = np.where(cont2_df[i] > ur, ur, cont2_df[i])
        cont2_df[i] = np.where(cont2_df[i] < lr, lr, cont2_df[i])
    return cont2_df

### For removing any elements from a list

In [None]:
def remove_from_a_list(list, to_be_removed):
    """Remove each item of ``to_be_removed`` from ``list`` in place.

    Only the first occurrence of each item is removed per entry in
    ``to_be_removed``.

    Args:
        list: The list to mutate. The name shadows the builtin but is kept
            so keyword callers continue to work.
        to_be_removed: Iterable of items to remove.

    Returns:
        None.

    Raises:
        ValueError: If an item is not present in ``list``.
    """
    # Snapshot first: when both arguments are the same list, iterating it
    # while removing from it would silently skip every other element.
    for item in tuple(to_be_removed):
        list.remove(item)

### Converting the OLS summary table to a dataframe

In [2]:
def generating_coef_table(lr_mod):
    """Return the names of the significant features of a fitted model.

    The HTML rendering of ``lr_mod.summary2()`` is parsed, its second table
    is read as the coefficient table, and the variables whose p-value is at
    most 0.05 are returned, leaving out "Intercept".

    Args:
        lr_mod: Fitted model exposing ``summary2().as_html()`` (presumably a
            statsmodels regression result — confirm against callers).

    Returns:
        List of variable names with ``p_val <= 0.05``, excluding "Intercept".
    """
    # Local import keeps this helper self-contained.
    from io import StringIO

    # Wrap the markup in StringIO: passing a literal HTML string straight to
    # read_html is deprecated in recent pandas releases.
    summary_html = lr_mod.summary2().as_html()
    coef_table = pd.read_html(StringIO(summary_html))[1]

    # Row 0 is dropped before the columns are renamed (it appears to hold
    # the table's own header row — confirm).
    coef_table = coef_table.drop(0, axis=0)
    coef_table.columns = ["var_name", "coef", "std_err", "t_stat", "p_val", "cof_25", "conf_975"]

    significant = coef_table.loc[coef_table["p_val"].astype('float') <= 0.05, "var_name"]
    return [name for name in significant.values if name != "Intercept"]

### Replace function

In [None]:
# Ordinal-encode CreditHistory: critical -> 0, poor -> 1, good -> 2, verygood -> 3.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column accessor: that chained in-place
# form acts on an intermediate object and does not update `df` when pandas
# Copy-on-Write is active.
df["CreditHistory"] = df["CreditHistory"].replace(
    {"critical": 0, "poor": 1, "good": 2, "verygood": 3}
)

### ROC & AUC

In [None]:
# AUC and ROC for the test data (the inputs are Y_Test and y_test_predict_prob)


# calculate AUC
auc = roc_auc_score(Y_Test,y_test_predict_prob[:, 1])#keeping only the probabilities for the desired class outcome
print('AUC: %.3f' % auc)
# calculate the ROC curve; roc_curve must already be in scope
# (e.g. from sklearn.metrics import roc_curve)
fpr, tpr, thresholds = roc_curve(Y_Test,y_test_predict_prob[:, 1])#keeping only the probabilities for the desired 
#class outcome
# dashed diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()

### Search and replace

In [None]:
# Replace every space with an underscore inside the values of df, in place.
# regex=True is passed so the pattern is treated as a regular expression
# rather than as a whole-cell value to match.
df.replace(" ","_",regex=True,inplace=True)

### Converting the datatype into number

In [None]:
# Convert the column to a numeric dtype and store it back under the same
# label ("column name" is a placeholder for the real column label).
df["column name"] = pd.to_numeric(df["column name"])

### Percentage in groupby

In [None]:
# Count the rows for every ("Ideal number of boys", "Blood sample") pair;
# reset_index turns the two group keys back into ordinary columns.
dfs=data.groupby(["Ideal number of boys","Blood sample"]).count().reset_index()
# Keep the two key columns plus the first count column (positions 0, 1, 2).
dfs=dfs.iloc[:,[0,1,2]]

In [2]:
def percent_groupby(dfs):
    """Compute the within-group share of flag 0 and flag 1, in percent.

    ``dfs`` is read positionally: column 0 is the group key, column 1 is a
    0/1 flag and column 2 is the count for that (group, flag) pair.

    Args:
        dfs: DataFrame with at least three columns laid out as above, with
            one row for flag 0 and one for flag 1 in every group.

    Returns:
        Flat list ``[pct0, pct1, pct0, pct1, ...]`` following the order in
        which the groups first appear. Previously the list was built but
        never returned, so callers always received None.

    Raises:
        IndexError: If a group lacks a row for flag 0 or flag 1.
    """
    group_col, flag_col, count_col = dfs.columns[:3]
    percent = []
    for group in dfs[group_col].unique():
        in_group = dfs[dfs[group_col] == group]
        value0 = in_group.loc[in_group[flag_col] == 0, count_col].iloc[0]
        value1 = in_group.loc[in_group[flag_col] == 1, count_col].iloc[0]
        total = value0 + value1
        percent.append((value0 / total) * 100)
        percent.append((value1 / total) * 100)
    return percent

### Column separation

In [1]:
def col_sep(df):
    """Split the column labels of ``df`` into numerical and categorical.

    A column is treated as categorical when its dtype equals "object";
    every other column is treated as numerical.

    Args:
        df: DataFrame to inspect.

    Returns:
        Tuple ``(numerical, categorical)`` of two lists of column labels,
        each in the original column order. Previously both lists were built
        but never returned, so callers always received None.
    """
    numerical = []
    categorical = []
    for column in df.columns:
        bucket = categorical if df[column].dtypes == "object" else numerical
        bucket.append(column)
    return numerical, categorical

### Columns with null values greater than 80%

In [5]:
import seaborn as sns

In [6]:
# Load the "mpg" example dataset through seaborn to have a frame to try the helpers on.
mpg = sns.load_dataset("mpg")

In [8]:
def col_null_greater_than_80(df):
    """Return the columns in which more than 80% of the values are null.

    Args:
        df: DataFrame to inspect.

    Returns:
        Index of the column labels whose null share exceeds 80%.
    """
    # The null share is a fraction in [0, 1], so it is compared with 0.8.
    # The previous comparison with 80 could never be true.
    null_fraction = df.isnull().mean()
    return null_fraction[null_fraction > 0.8].index

#### Note about the datetime module: two-digit year ambiguity. With `%y`, two-digit years 00–68 are assigned to the 2000s (2000–2068), while 69–99 are assigned to the 1900s (1969–1999).

In [None]:
import datetime

# Demonstration of the two-digit-year pivot of %y. The expected results are
# written as comments so the cell stays valid Python.
datetime.datetime.strptime('31-Dec-68', '%d-%b-%y').date()
# -> datetime.date(2068, 12, 31)

datetime.datetime.strptime('1-Jan-69', '%d-%b-%y').date()
# -> datetime.date(1969, 1, 1)


In [2]:
##Code to fix this is given below

In [1]:
import datetime

def fix_date(x, cutoff_year=2040):
    """Move a date parsed into the wrong century back by 100 years.

    Args:
        x: Object with ``year``, ``month`` and ``day`` attributes, such as a
            ``datetime.date`` or ``datetime.datetime``.
        cutoff_year: Years greater than or equal to this value are shifted
            back by 100 years. Defaults to 2040, the value that was
            previously hard-coded.

    Returns:
        A ``datetime.date`` with the corrected year and the same month and day.
    """
    year = x.year - 100 if x.year >= cutoff_year else x.year
    return datetime.date(year, x.month, x.day)

#### Separating categorical and numerical columns the easy way!

In [None]:
# Distinct dtypes present in df (the value is only displayed, not stored).
list(set(df.dtypes.tolist()))
# Keep only the float64 and int64 columns.
df_num = df.select_dtypes(include=["float64","int64"])

In [1]:
def train_cont_col_func(df):
    """Return the labels of the float64 and int64 columns of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        Index of the selected column labels, in their original order.
    """
    continuous = df.select_dtypes(include=["float64", "int64"])
    return continuous.columns

In [2]:
def train_cat_col_func(df):
    """Return the labels of the object-dtype columns of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        Index of the selected column labels, in their original order.
    """
    categorical = df.select_dtypes(include=["O"])
    return categorical.columns