# Crowd Funding Study Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt


In [None]:
# Render every column when a DataFrame is displayed (no horizontal truncation).
pd.set_option("display.max_columns", None)

# Load the scraped crowd-offerings workbook; index_col=None keeps the default integer index.
df = pd.read_excel("scrapedCrowdOfferingsData.xlsx", index_col=None)

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Many of the object-typed columns (e.g. compensationAmount, financialInterest) intuitively should be numerical, so let's look at the object columns in more detail.

In [None]:
# Restrict the view to the columns pandas stored with object dtype.
df.select_dtypes(include="object")

In [None]:
# Preview rows whose financialInterest carries real content, i.e. it is neither
# missing nor one of the placeholder answers 'None.' / 'No'.
# The original cell built this filtered frame but discarded it (only the last
# expression of a cell is displayed) and then showed the unfiltered df.head(5).
has_financial_interest = ~(
    df['financialInterest'].isna()
    | df['financialInterest'].isin(['None.', 'No'])
)
df[has_financial_interest].head(5)

In [None]:
# Peek at the first five rows that actually have a progressUpdate value.
df.dropna(subset=['progressUpdate']).head(5)

- date column needs to be converted to datetime
- compensationAmount and priceDeterminationMethod are in free-text form, with no uniform structure from which features could be extracted.
- financialInterest is also in text form, but a derived yes/no boolean column could be created so that this feature can be included later during modeling if needed.
- The cardinality of the object columns needs to be determined in order to judge their usefulness for building a model
- progressUpdate is in text form, but it seems this is where the data for raisedOfferingAmount would come from after some processing

In [None]:
# Cap rendered output at 50 rows.
pd.set_option('display.max_rows', 50)

# Parse dateIncorporation into a real datetime64 column. `.dt.normalize()` resets
# the time-of-day to midnight (date-only semantics) while keeping the datetime
# dtype; the previous `.dt.date` produced Python `date` objects, which left the
# column as object dtype and made it show up among the object columns below.
df['dateIncorporation'] = pd.to_datetime(df['dateIncorporation']).dt.normalize()

# Number of distinct values in each remaining object-typed column.
cardinality = df.select_dtypes(['object']).nunique()
cardinality


In [None]:
# Remove the row cap so the per-column summary below is rendered in full.
pd.set_option("display.max_rows", None)

# Percentage of missing (NaN) values in each column.
df.isna().sum().div(len(df)).mul(100)

### Interesting observations
- Each of the financial columns has 6.5% of its values missing (NaN), which indicates a systematic pattern
- No missing values in isAmendment, submissionType, so these can be used as filters
- issuerCIK and issuerName both have the same percentage of missing values, indicating that these two fields are likely missing together in the same records
- Some categorical columns can be dropped if their cardinality or their number of missing values is high: ['compensationAmount', 'financialInterest',
       'securityOfferedOtherDesc', 'priceDeterminationMethod', 'oversubscriptionDesc', 'natureOfAmendment', 'legalStatusOtherDesc','jurisdictionOrganization',
       'city','zipCode', 'issuerWebsite']

In [None]:
# Free-text / high-cardinality / mostly-missing columns excluded from the cleaned frame.
columns_to_drop = [
    'compensationAmount',
    'financialInterest',
    'securityOfferedOtherDesc',
    'priceDeterminationMethod',
    'oversubscriptionDesc',
    'natureOfAmendment',
    'legalStatusOtherDesc',
    'jurisdictionOrganization',
    'city',
    'zipCode',
    'issuerWebsite',
]

# `columns=` already targets the column axis, so no separate axis argument is needed.
# `df` itself is left untouched; the result is a new frame.
df_clean = df.drop(columns=columns_to_drop)


In [None]:
# Rows of the cleaned frame where isAmendment equals 0.
# (Author's note: the column has only the values 0 and 1, and no nulls.)
df_clean.loc[df_clean['isAmendment'].eq(0)]
