### User defined functions for the Spark case study

#### 1. Impute Missing Values

In [1]:
def impute_numeric_values(dataframe,column,approach=1):
    '''Return a copy of the dataframe with missing values of one numeric column handled.

        dataframe :: Dataframe to clean (it is never modified, a copy is returned)
        column    :: Column name(numeric)
        approach  :: 1,2,3 (default value 1)
            1. Delete the row
            2. Replace missing value with mean value.
            3. Replace missing value with 0
        Any other approach value returns the copy unchanged.

        Prints the number of rows of the result before returning it.
    '''
    dataframe_filtered = dataframe.copy()
    if approach == 1:
        # dropna works on row positions, so duplicated index labels cannot
        # cause rows that do have a value to be removed as well.
        dataframe_filtered = dataframe_filtered.dropna(subset=[column])
    elif approach == 2:
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the chained in-place form acts on a temporary object
        # and leaves the frame untouched when pandas copy-on-write is active.
        dataframe_filtered[column] = dataframe_filtered[column].fillna(dataframe[column].mean())
    elif approach == 3:
        dataframe_filtered[column] = dataframe_filtered[column].fillna(0)

    # len() is the row count; count()[0] reported the non-null values of the
    # first column and relied on positional indexing of a label-indexed Series.
    print('\nNumber of Records (after imputing missing values): ',len(dataframe_filtered))
    return dataframe_filtered

#### 2. Define function to write a (.csv) file to the current working directory.

In [7]:
# optional - if you want to store the dataframe so that it can be used with external applications such as Tableau
def writeDF(dataframe,outFileName):
    '''
    Save a dataframe as 'outFileName.csv' in the current working directory.

    dataframe   : Dataframe to export
    outFileName : Name of the output file, without the '.csv' extension

    Once the file is written, its name and its size in KB (bytes / 1000,
    rounded to one decimal) are printed.
    '''
    csv_path = './'+outFileName+'.csv'
    dataframe.to_csv(path_or_buf = csv_path)
    size_in_kb = round(os.path.getsize(csv_path)/1000, 1)
    print('File:'+ outFileName+'.csv created at current working directory \nSize of file :\t'+str(size_in_kb)+ ' KB' )

#### 3. Define Function to get the avg_funding_amount for any Funding type

In [2]:
def avg_funding_amount(funding_type):
    '''Return the mean 'raised_amount_usd', rounded to 2 decimals, over the rows of
    the notebook-level master_frame_imputed whose 'funding_round_type' equals funding_type.
    '''
    is_requested_type = master_frame_imputed['funding_round_type']==funding_type
    raised_amounts = master_frame_imputed[is_requested_type]['raised_amount_usd']
    return round(raised_amounts.mean(),2)

#### 4. Defining function demap, which will convert data in scattered crosstab input to two columns output.

In [5]:
def demap(df,outColName):
    '''
    Collapse a crosstab-style dataframe into two columns: the first column of
    df and, for every row, the name of the column whose value equals 1.

    df          :: dataframe in which each row has exactly one column equal to 1
    outColName  :: Output column name.

    Returns a copy of df reduced to its first column and its last column after
    outColName has been assigned.
    Raises ValueError when a row does not have exactly one column equal to 1.
    '''
    column_names = list(df.columns)
    flagged_columns = []
    # Walk the rows by position so the result does not depend on the index
    # labels (the frame may have been filtered or re-indexed beforehand).
    for row_position, row_values in enumerate(df.itertuples(index=False, name=None)):
        hits = [name for name, value in zip(column_names, row_values) if value == 1]
        # Check every row on its own: a row without a flag next to a row with
        # two flags would otherwise yield the right length with shifted labels.
        if len(hits) != 1:
            raise ValueError(
                'Row at position ' + str(row_position) + ' has ' + str(len(hits))
                + ' columns equal to 1, expected exactly one'
            )
        flagged_columns.append(hits[0])
    df2 = df.copy()
    df2[outColName] = flagged_columns
    return df2.iloc[:,[0,-1]]

#### 5. Define function to get top sectors count and name

In [6]:
def getSector(dataset,n,identifier):
    '''
    Return the name or the investment count of the n-th most frequent 'main_sector'.

    dataset     :: dataframe with a 'main_sector' column
    n           :: n represents nth order, 0 for top, 1 for second top and so on.
    identifier  :: identifier = 'count' to get the count, identifier = 'name' to get the sector name.

    Returns None for any other identifier.
    '''
    if identifier not in ('name', 'count'):
        return None
    sector_counts = dataset.groupby(by='main_sector')['main_sector'].count().sort_values(ascending=False)
    if identifier == 'name':
        return sector_counts.index[n]
    # The Series is labelled by sector name, so n has to be resolved by
    # position; plain [n] would be treated as a label lookup.
    return sector_counts.iloc[n]

<font color='blue'><b>UDF to get top sectors count and name</b></font>

getSector(dataset,n,identifier) will give you 'name of the sector' or 'count of investments in sectors' based on the parameter value.

In [2]:
#dynamic = False

# dynamic = True
# Note: for the English-speaking countries, the information can be fetched dynamically from MySQL > world database >
# country & countrylanguage tables.
# Run the command mentioned in the prerequisites: conda install -c anaconda mysql-connector-python
# Run this sql on world db > INSERT INTO COUNTRYLANGUAGE VALUES('IND','English','T',4.5)

# dynamic = False
# Manually a column will be added(by referring pdf) in top9 dataframe called IsOfficialEN

# Now the requirement is to fetch top 3 countries from top 9 dataframe where official language is English, we will fetch this 
# information from MySQl > world database > country & country language table. Alternatively we can check this information

#if dynamic:
#    import mysql.connector as con
#    db_connection = con.connect(host='localhost', database='world', user='root', password='mysql@123')
#    query = 'select code,name from country where code in(SELECT countrycode FROM COUNTRYLANGUAGE WHERE LANGUAGE=%s AND ISOFFICIAL=%s)'
#    countryEN =  pd.read_sql(sql =query,con =db_connection,params=['English','T'])
#    top9['IsOfficialEN'] = top9.country_code.isin(countryEN['code'])
#else: