# Example 1: convert string to `datetime` object

This following functions converts any string in the form `YYYY-MM-DD` to a datetime `datetime` variable.


In [70]:
def transform_string_to_date(date_string):
        '''
        input:
            date_string - str, represents a date as 'YYYY-MM-DD'
        output:
            date - datetime.date 
        '''
        date = None 
        from datetime import datetime
        date = datetime.strptime(date_string, '%Y-%m-%d').date()
        return date 
    
# test function      
date_string = '2013-09-12'
date = transform_string_to_date(date_string)
print(date)
print(type(date))

2013-09-12
<class 'datetime.date'>


# Example 2: calculate stock gain 

The following function calculates the stock gain from 9-1-2000 Open price to 3-7-2008 close price for 10 shares of stock and returns a float which is the amount of gain. 


In [21]:
import pandas as pd
aapl_df = pd.read_csv('./aapl_stock_price.csv')
aapl_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adjusted Close
0,2015-08-24,94.870003,103.449997,92.0,103.120003,161454200.0,103.120003
1,2015-08-21,110.43,111.900002,105.650002,105.760002,126289200.0,105.760002
2,2015-08-20,114.080002,114.349998,111.629997,112.650002,67765500.0,112.650002
3,2015-08-19,116.099998,116.519997,114.68,115.010002,47445700.0,115.010002
4,2015-08-18,116.43,117.440002,116.010002,116.5,34461400.0,116.5


In [167]:
# Method 1: not using pandas 
def calculate_eight_year_gain(fname):
        '''
        Calculates the stock gain from 9-1-2000 Open price to 3-7-2008 close price for
        10 shares of stock
        input:
            fname - str, aapl stock price filename
        output:
            gain - float, monetary gain during period
        '''
        gain = None 
        with open(fname, "r") as stock:
            stock_lines = stock.readlines()
            
        for i,line in enumerate(stock_lines):
            share_num = 10
            if '2000-09-01' in line:
                purchased_list = line.strip().split(',')
                opening_price = purchased_list[1]
            elif '2008-03-07' in line:
                sold_list = line.strip().split(',')
                closing_price = sold_list[4]

        gain = (float(closing_price)-float(opening_price))*share_num  
        
        return gain 
        
fname = './aapl_stock_price.csv'
calculate_eight_year_gain(fname)

609.37596

In [194]:
# Method 2: using pandas
def calculate_eight_year_gain(fname):
        '''
        Calculates the stock gain from 9-1-2000 Open price to 3-7-2008 close price for
        10 shares of stock
        input:
            fname - str, aapl stock price filename
        output:
            gain - float, monetary gain during period
        '''
        
        import pandas as pd
        gain = None 
        aapl_df = pd.read_csv(fname)
        purchased_df = aapl_df[aapl_df.Date == '2000-09-01']
        sold_df = aapl_df[aapl_df.Date == '2008-03-07']
        gain = ( -float(purchased_df['Open'])+ float(sold_df['Close']) )*10
        return gain 
        
fname = './aapl_stock_price.csv'
calculate_eight_year_gain(fname)

609.37596

# Example 3: count the number of trading dates 

The following function counts the number of dates AAPL was traded between two dates (both dates inclusive). 


In [202]:
# Method 1: not using pandas 
def count_number_of_trading_days(fname, start, stop):
    '''
    Counts the number of trading days between 1988-06-06 and 1999-12-07 (inclusive)
    input:
        start - str, start date in string format
        stop - str, stop date in string format
    output:
        trading_days - int, number of trading days during the time period
    '''
    trading_dates = None 
    from datetime import datetime
    
    # convert start date and end date to daytime.date objects  
    start_date = datetime.strptime(start, "%Y-%m-%d").date()
    stop_date = datetime.strptime(stop,"%Y-%m-%d").date()
    
    with open(fname, "r") as stock:
        stock_lines = stock.readlines()
        
    trading_dates = 0
    trading_dates_list = []        
    for i,line in enumerate(stock_lines[1:]):
        trading_date = line.strip().split(',')[0]
        # convert to datetime.date objects 
        trading_date = datetime.strptime(trading_date, "%Y-%m-%d").date()
        trading_dates_list.append(trading_date)
        if trading_date <= stop_date and trading_date >= start_date:
#             print(trading_date)
            trading_dates += 1 
    
    return trading_dates


# test function 
fname = './aapl_stock_price.csv'
start =  '1988-06-06'
stop = '1999-12-07' 
count_number_of_trading_days(fname, start, stop)

2909

In [203]:
# Method 2: using pandas 
def count_number_of_trading_days(fname, start, stop):
    '''
    Counts the number of trading days between 1988-06-06 and 1999-12-07 (inclusive)
    input:
        start - str, start date in string format
        stop - str, stop date in string format
    output:
        trading_days - int, number of trading days during the time period
    '''
    trading_dates = None 
    from datetime import datetime
    import pandas as pd
        
    aapl_df = pd.read_csv(fname)
    aapl_df_select = aapl_df[(aapl_df.Date >= start) & (aapl_df.Date <= stop)]
    trading_dates = aapl_df_select['Date'].count()
    
    return trading_dates


# test function 
fname = './aapl_stock_price.csv'
start =  '1988-06-06'
stop = '1999-12-07' 
count_number_of_trading_days(fname, start, stop)

2909

# Example 4: count the number of trading days with positive gains

The following function counts the number of trading days between the start and stop dates (inclusive) where the closing price is greater than the opening price. 

In [113]:
# Method 1: not using pandas 
def number_of_days_positive_gain(fname, start, stop):
        '''
        Counts the number of trading days between the start and stop date (inclusive)
        where the close price is greater than the open price.
        input:
            start - str, start date in string format
            stop - str, stop date in string format
        output:
            positive_days - int, number of trading days during the time period
                                 where a positive gain was made
        '''
        positive_dates = None 
        from datetime import datetime
    
        # convert start date and end date to daytime.date objects  
        start_date = datetime.strptime(start, "%Y-%m-%d").date()
        stop_date = datetime.strptime(stop,"%Y-%m-%d").date()

        with open(fname, "r") as stock:
            stock_lines = stock.readlines()

        positive_days = 0
        trading_dates_list = []        
        for i,line in enumerate(stock_lines[1:]):
            stock_list = line.strip().split(',')
            trading_date = stock_list[0]
            # convert to datetime.date objects 
            trading_date = datetime.strptime(trading_date, "%Y-%m-%d").date()
            trading_dates_list.append(trading_date)
            if trading_date <= stop_date and trading_date >= start_date:
                stock_list = line.strip().split(',')
                opening_price = float(stock_list[1])
                closing_price = float(stock_list[4])
                if closing_price > opening_price:
                    positive_days += 1 
    
        return positive_days


# test function 
fname = './aapl_stock_price.csv'
start =  '1988-06-06'
stop = '1999-12-07' 
number_of_days_positive_gain(fname, start, stop)

1322

In [211]:
# Method 2: using pandas 
def number_of_days_positive_gain(fname, start, stop):
        '''
        Counts the number of trading days between the start and stop date (inclusive)
        where the close price is greater than the open price.
        input:
            start - str, start date in string format
            stop - str, stop date in string format
        output:
            positive_days - int, number of trading days during the time period
                                 where a positive gain was made
        '''
        positive_days = None 
        
        import pandas as pd 
        aapl_df = pd.read_csv(fname)
        aapl_df_select = aapl_df[(aapl_df.Date >= start) & (aapl_df.Date <= stop) & (aapl_df['Close']>aapl_df['Open'])]
        positive_days = aapl_df_select['Date'].count()
    

        return positive_days


# test function 
fname = './aapl_stock_price.csv'
start =  '1988-06-06'
stop = '1999-12-07' 
positive_days = number_of_days_positive_gain(fname, start, stop)
print(positive_days)

1322


# Example 5: largest sample size

Now we work on data on the college majors. The following functions identifies the major with the large sample size from the survey. 

In [22]:
import pandas as pd
aapl_df = pd.read_csv('./grad-students.csv')
aapl_df.head()

Unnamed: 0,Major_code,Major,Major_category,Grad_total,Grad_sample_size,Grad_employed,Grad_full_time_year_round,Grad_unemployed,Grad_unemployment_rate,Grad_median,...,Nongrad_total,Nongrad_employed,Nongrad_full_time_year_round,Nongrad_unemployed,Nongrad_unemployment_rate,Nongrad_median,Nongrad_P25,Nongrad_P75,Grad_share,Grad_premium
0,5601,CONSTRUCTION SERVICES,Industrial Arts & Consumer Services,9173,200,7098,6511,681,0.087543,75000.0,...,86062,73607,62435,3928,0.050661,65000.0,47000,98000.0,0.09632,0.153846
1,6004,COMMERCIAL ART AND GRAPHIC DESIGN,Arts,53864,882,40492,29553,2482,0.057756,60000.0,...,461977,347166,250596,25484,0.068386,48000.0,34000,71000.0,0.10442,0.25
2,6211,HOSPITALITY MANAGEMENT,Business,24417,437,18368,14784,1465,0.073867,65000.0,...,179335,145597,113579,7409,0.048423,50000.0,35000,75000.0,0.119837,0.3
3,2201,COSMETOLOGY SERVICES AND CULINARY ARTS,Industrial Arts & Consumer Services,5411,72,3590,2701,316,0.080901,47000.0,...,37575,29738,23249,1661,0.0529,41600.0,29000,60000.0,0.125878,0.129808
4,2001,COMMUNICATION TECHNOLOGIES,Computers & Mathematics,9109,171,7512,5622,466,0.058411,57000.0,...,53819,43163,34231,3389,0.0728,52000.0,36000,78000.0,0.144753,0.096154


In [9]:
def largest_sample_size(fname):
        '''
        Identifies the major with the largest sample size from the survey
        input:
            fname - str, the filename of the grad students csv file
        output:
            major - str, the major with the largest sample size
        '''
        import pandas as pd
        df = pd.read_csv(fname)

        sample_idxmax = df.Grad_sample_size.idxmax()
        major = df['Major'][sample_idxmax]
    
        return major
        

In [10]:
fname = './grad-students.csv'
largest_sample_size(fname)

'BIOLOGY'

# Example 6: calculate unemployment number

In a future far, far away (January 2017) the markets have cratered, upending the United States economy. In this brave new future, manufacturing, business, and commerce are 95% unemployed. All of our attentation has turned to individuals in the Arts (Major_category of Arts), which are now in high demand and only 5% are unemployed.

How many of the individuals in the graduate student dataset will be unemployed in 2016?

In [19]:
def brave_new_future_unemployment(fname):
    '''
    What is the number of unemployed individuals when majors in the `Arts` are at 5% unemployment and
    all other majors are at 95% unemployment
    input:
        fname - str, filepath to the graduate students file
    output:
        unemployed - int, the number of unemployed individuals
    '''
    unemployed = None 
    # import libraries 
    import pandas as pd
    df = pd.read_csv(fname)
    df_arts = df[df['Major_category']=='Arts']
    df_not_arts = df[df['Major_category']!='Arts']
    arts_total = df_arts['Grad_total'].sum() + df_arts['Nongrad_total'].sum()
    not_arts_total = df_not_arts['Grad_total'].sum() + df_not_arts['Nongrad_total'].sum()
    unemployed = round(arts_total*0.5 + not_arts_total*0.95)
    return unemployed

fname = './grad-students.csv'
unemployed = brave_new_future_unemployment(fname)
print(unemployed)

55265108


# Example 7: 

The following functions calculates the number of majors that have more non grads employed than grads. 

In [166]:
def more_nongrad_employed(fname):
        '''
        How many majors have more non grads employed than grads?
        input:
            fname - str, filename
        output:
            num_nongrad_majors - int, number of majors with more non-graduates employed than graduates.
        '''
        import pandas as pd
        df = pd.read_csv(fname)
        df['Grad_employed']
        df2 = df[df['Nongrad_employed']>df['Grad_employed']]
        num_nongrad_majors = df2['Major'].count()
        

        return num_nongrad_majors 
    
    
    
fname = './grad-students.csv'
num_nongrad_majors  = more_nongrad_employed(fname)
print(num_nongrad_majors)

124


In [130]:
# read data frame 
data_path = './grad-students.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Major_code,Major,Major_category,Grad_total,Grad_sample_size,Grad_employed,Grad_full_time_year_round,Grad_unemployed,Grad_unemployment_rate,Grad_median,...,Nongrad_total,Nongrad_employed,Nongrad_full_time_year_round,Nongrad_unemployed,Nongrad_unemployment_rate,Nongrad_median,Nongrad_P25,Nongrad_P75,Grad_share,Grad_premium
0,5601,CONSTRUCTION SERVICES,Industrial Arts & Consumer Services,9173,200,7098,6511,681,0.087543,75000.0,...,86062,73607,62435,3928,0.050661,65000.0,47000,98000.0,0.09632,0.153846
1,6004,COMMERCIAL ART AND GRAPHIC DESIGN,Arts,53864,882,40492,29553,2482,0.057756,60000.0,...,461977,347166,250596,25484,0.068386,48000.0,34000,71000.0,0.10442,0.25
2,6211,HOSPITALITY MANAGEMENT,Business,24417,437,18368,14784,1465,0.073867,65000.0,...,179335,145597,113579,7409,0.048423,50000.0,35000,75000.0,0.119837,0.3
3,2201,COSMETOLOGY SERVICES AND CULINARY ARTS,Industrial Arts & Consumer Services,5411,72,3590,2701,316,0.080901,47000.0,...,37575,29738,23249,1661,0.0529,41600.0,29000,60000.0,0.125878,0.129808
4,2001,COMMUNICATION TECHNOLOGIES,Computers & Mathematics,9109,171,7512,5622,466,0.058411,57000.0,...,53819,43163,34231,3389,0.0728,52000.0,36000,78000.0,0.144753,0.096154


In [163]:
df['Grad_employed']
df2 = df[df['Nongrad_employed']>df['Grad_employed']]
df2['Major'].count()

124