In [None]:
# NOTE(review): several imports below look like IDE auto-import artefacts rather than
# deliberate dependencies -- confirm and clean up:
#   * `numpy` is obtained through sympy's `sample_numpy` module instead of a plain
#     `import numpy`, so the notebook only runs where sympy is installed.
#   * `add_constant` is taken from `numba.np.npdatetime`; presumably the statsmodels
#     helper of the same name was intended -- verify.
#   * `visibility_graph`, `add_constant`, `LogisticRegression`, `index`, `seealso`,
#     `sparse` and `supported` are not referenced by any cell visible in this part
#     of the notebook.
import pandas
import warnings
from networkx import visibility_graph
from numba.np.npdatetime import add_constant
from sklearn.linear_model import LogisticRegression
from sphinx.addnodes import index, seealso
from sympy.physics.quantum.matrixutils import sparse
from sympy.stats.sampling.sample_numpy import numpy
from twisted.internet.ssl import supported
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Load the dataset on which data preprocessing was already performed in the
# previous tasks of the internship program.
df = pandas.read_csv(filepath_or_buffer='pro_1.csv')

In [None]:
# The target variable is the 'status' column: each company is classified as 'operating', 'acquired', 'ipo' or 'closed'.
# This is therefore a multi-class classification problem.

In [None]:
# Understanding the dataset: size, column names and dtypes
print('Number of rows : ', df.shape[0])
print('Number of columns : ', df.shape[1])

print('All the features of the dataset : ')
print(df.columns)

print('Datatype of each feature : ')
print(df.dtypes)

print('Unique datatypes : ')
print(df.dtypes.unique())

# Split the columns by dtype so each group can be counted separately.
fl = df.select_dtypes(include=[float])
bf = df.select_dtypes(include=[bool])
of = df.select_dtypes(include=[object])

print('Number of Float features : ', fl.shape[1])
print('Number of Boolean features : ', bf.shape[1])
print('Number of Object features : ', of.shape[1])

In [None]:
# Irrelevant features were already removed in the previous tasks of the internship.
# The commented-out snippet below comes from that previous task and is kept for reference only.
# company = pandas.read_csv("companies.csv")
# company.drop('region',axis=1,inplace=True)
# company.drop('city',axis=1,inplace=True)
# company.drop(company.columns['Unnamed: 0.1', 'entity_type', 'entity_id', 'parent_id', 'created_by', 'created_at', 'updated_at'],axis=1,inplace=True)
# company.drop(company.columns['domain','homepage_url', 'twitter_username', 'logo_url', 'logo_width', 'logo_height', 'short_description', 'description', 'overview', 'tag_list', 'name', 'normalized_name', 'permalink', 'invested_companies'],axis=1,inplace=True)

In [None]:
# Handling multicollinearity
# Identify features that are highly collinear with at least one other feature
# (candidates for removal).
#
# Correlations are computed on numeric/boolean columns only. Since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of silently skipping
# them, so a text-valued column (e.g. 'status' labels) would abort the cell.
print(df.select_dtypes(include=['number', 'bool']).corr().to_string())

df_for_cor = df.drop('status', axis=1)
cm = df_for_cor.select_dtypes(include=['number', 'bool']).corr()

# Flag |r| >= 0.85 while excluding the diagonal (every feature correlates 1.0
# with itself). The mask is built as a new frame rather than by writing into
# `mask.values`, which is not guaranteed to be a writable view of the frame.
mask = (cm.abs() >= 0.85) & ~numpy.eye(len(cm), dtype=bool)

# A column is reported when it is highly correlated with any other column.
high_corr = mask.columns[mask.any().to_numpy()].tolist()
print(high_corr)

In [None]:
# Variance Inflation Factor (VIF) of the non-boolean features
bf = df.select_dtypes(include=bool)
b_cols = bf.columns.values
print(b_cols)

# Leave out the boolean columns and 'isClosed', then discard rows with missing values.
df_for_vif = (
    df.drop(columns=b_cols)
      .drop(columns='isClosed')
      .dropna()
)
print(df_for_vif.head(10).to_string())

# One VIF per remaining column, computed against all the other remaining columns.
# NOTE(review): no intercept column is added to the design matrix before calling
# variance_inflation_factor -- TODO confirm whether a constant should be included.
vif_vals = pandas.Series(
    data=[
        variance_inflation_factor(df_for_vif.values, position)
        for position in range(len(df_for_vif.columns))
    ],
    index=df_for_vif.columns,
)

# NOTE(review): the VIFs are divided by 10**6 before the "> 8" filter, so the
# cut-off applies to the rescaled values rather than to raw VIFs -- confirm
# this is intended.
vif_vals = vif_vals / 10**6
print(vif_vals)
print(vif_vals.loc[vif_vals > 8])

In [None]:
# Mutual Information to understand which features are important
from sklearn.feature_selection import mutual_info_classif

# Reload the preprocessed data, encode booleans as 0/1 and drop 'active_days'.
new_df = pandas.read_csv('pro_1.csv')
new_df = new_df.replace({True: 1, False: 0})
new_df = new_df.drop('active_days', axis=1)

# Features / target split; 'status' is the class label.
X = new_df.drop('status', axis=1)
Y = new_df['status']

# Tell the estimator which features are discrete. Passing discrete_features=True
# would treat every column -- including continuous float features -- as
# categorical, turning each distinct value into its own category and biasing
# the MI estimate. Float columns are therefore treated as continuous and all
# other columns (e.g. the 0/1-encoded booleans) as discrete.
discrete_mask = [not pandas.api.types.is_float_dtype(dtype) for dtype in X.dtypes]

# The estimator used for continuous features adds a small random noise, so the
# seed is fixed to make the scores reproducible across runs.
mi = mutual_info_classif(X, Y, discrete_features=discrete_mask, random_state=42)

mi_df = pandas.DataFrame({'Feature': X.columns, 'Mutual Information': mi})
print(mi_df.to_string())

In [None]:
# Random Forest Feature Selection via Permutation Importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# 80/20 split; seeded so the split (and everything derived from it) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.8, random_state=42
)

rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)

# `rf.feature_importances_` holds the impurity-based importances, not
# permutation importances. Since this cell (and the column below) reports
# permutation importance, compute it explicitly: each feature is shuffled on
# the held-out split and the resulting drop in score is averaged over the repeats.
perm_result = permutation_importance(
    rf, x_test, y_test, n_repeats=5, random_state=42
)
importances = perm_result.importances_mean

per_imp = pandas.DataFrame({
    'Feature': X.columns,
    'Permutation Importance': importances
})
print(per_imp.sort_values(by='Permutation Importance', ascending=False))

In [None]:
# Min-Max scaling of the feature matrix
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum/maximum from X, then rescale X with them.
scaler = MinMaxScaler().fit(X)
x_scaled = scaler.transform(X)

# Wrap the scaled array in a frame that carries the original column names.
x_scaled_df = pandas.DataFrame(data=x_scaled, columns=X.columns)
print(x_scaled_df)

In [None]:
# Log transformation: log(x + 1) applied to every feature column
# NOTE(review): values <= -1 yield -inf/NaN under this transform -- confirm that
# all features are non-negative.
X_log_transformed = (X + 1).apply(numpy.log)
l_t_df = pandas.DataFrame(data=X_log_transformed, columns=X.columns)
print(l_t_df)

In [None]:
# Robust scaling of the feature matrix
from sklearn.preprocessing import RobustScaler

# Fit the scaler on X, then apply it to the same data.
rs = RobustScaler().fit(X)
X_robust_scaled = rs.transform(X)

# Wrap the scaled array in a frame that carries the original column names.
X_robust_scaled_df = pandas.DataFrame(data=X_robust_scaled, columns=X.columns)
print(X_robust_scaled_df)

In [None]:
# PCA on the Min-Max scaled features
from sklearn.decomposition import PCA

# A float n_components in (0, 1) is interpreted by scikit-learn as the share of
# variance to retain, so the number of components is chosen to reach 95 %.
pca = PCA(
    n_components=0.95,
)
x_pcad = pca.fit_transform(x_scaled)
print(x_pcad)