# Pandas 1

## Name Dallin Stewart

## Class ACME 002

## Date A long time ago

In [1]:
import numpy as np
import pandas as pd

# Problem 1

In [None]:
# Prob 1
def prob1(file='budget.csv'):
    """"
    Read in budget.csv as a DataFrame with the index as column 0 and perform each of these operations on the DataFrame in order. 
    
    1) Reindex the columns such that amount spent on groceries is the first column and all other columns maintain the same ordering.
    2) Sort the DataFrame in descending order based on how much money was spent on Groceries.
    3) Reset all values in the 'Rent' column to 800.0.
    4) Reset all values in the first 5 data points to 0.0
    
    Return the values of the updated DataFrame as a NumPy array.
    
    Parameters:
        file (str): name of datafile
        
    Return:
        values (ndarray): values of DataFrame
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file, index_col=0)

    # 1) Reindex the columns such that 'Groceries' comes first
    cols = ['Groceries'] + [col for col in df.columns if col != 'Groceries']
    df = df[cols]

    # 2) Sort the DataFrame in descending order based on 'Groceries' column
    df.sort_values(by='Groceries', ascending=False, inplace=True)

    # 3) Reset all values in the 'Rent' column to 800.0
    df['Rent'] = 800.0

    # 4) Reset all values in the first 5 data points to 0.0
    df.iloc[:5] = 0.0

    return df.values

In [None]:
prob1()

# Problem 2

In [None]:
# Prob 2
def prob2(file='budget.csv'):
    """
    Read in file as DataFrame.
    Fill all NaN values with 0.0.
    Create two new columns, 'Living Expenses' and 'Other'. 
    Sum the columns 'Rent', 'Groceries', 'Gas' and 'Utilities' and set it as the value of 'Living Expenses'.
    Sum the columns 'Dining Out', 'Out With Friends' and 'Netflix' and set as the value of 'Other'.
    Identify which column, other than 'Living Expenses' correlates most with 'Living Expenses'
    and which column other than 'Other' correlates most with 'Other'.

    Return the names of each of those columns as a tuple.
    The first should be of the column corresponding to \li{'Living Expenses'} and the second to \li{'Other'}.
    
    Parameters:
        file (str): name of datafile
        
    Return:
        values (tuple): (name of column that most relates to Living Expenses, name of column that most relates to Other)
    """
    # Read the CSV file into a DataFrame and fill NaN values with 0.0
    df = pd.read_csv(file)
    df = df.fillna(0.0)

    # Create the 'Living Expenses' column by summing specified columns
    df['Living Expenses'] = df[['Rent', 'Groceries', 'Gas', 'Utilities']].sum(axis=1)

    # Create the 'Other' column by summing specified columns
    df['Other'] = df[['Dining Out', 'Out With Friends', 'Netflix']].sum(axis=1)
    
    correlations = df.corr()

    # Calculate correlations of all columns with 'Living Expenses' and 'Other' columns
    correlations_expenses = correlations['Living Expenses'].drop(['Living Expenses', 'Other'])
    correlations_other = correlations['Other'].drop(['Living Expenses', 'Other'])

    # Find the column with the highest correlation with 'Living Expenses' and 'Other'
    most_expenses = correlations_expenses.idxmax()
    most_other = correlations_other.idxmax()

    return (most_expenses, most_other)

In [None]:
prob2()

# Problem 3

In [2]:
def prob3(file='crime_data.csv'):
    """
    Read in crime data and use pandas to answer the following questions.
    
    Set the index as the column 'Year', and return the answers to each question as a tuple.
    
    1) Identify the three crimes that have a mean over 1,500,000. 
    Of these three crimes, which two are very correlated? 
    Which of these two crimes has a greater maximum value?
    Save the title of this column as a variable to return as the answer.
    
    2) Examine the data since 2000.
    Sort this data (in ascending order) according to number of murders.
    Find the years where Aggravated Assault is greater than 850,000.
    Save the indices (the years) of the masked and reordered DataFrame as a NumPy array to return as the answer.
    
    3) What year had the highest crime rate? 
    In this year, which crime was committed the most? 
    What percentage of the total crime that year was it? 
    Save this value as a float.
    
    
    Parameters:
        file (str): data
    
    Return:
        ans_1 (string): answer to Question 1
        ans_2 (ndarray): answer to Question 2
        ans_3 (float): answer to Question 3
    """
    # read the csv file
    df = pd.read_csv(file, index_col='Year')

    # PART 1
    # identify the three crimes
    mean_over = df.mean() > 1_500_000
    crimes_over = mean_over[mean_over][2:].index.tolist()
    print(crimes_over)

    # calculate correlations between crimes
    correlations = df[crimes_over].corr()

    # find the two most correlated crimes
    most_corr = np.unravel_index(np.argmax(correlations.values - np.eye(correlations.shape[0])), correlations.shape)
    corr_crime_1, corr_crime_2 = crimes_over[most_corr[0]], crimes_over[most_corr[1]]

    # determine which of the two correlated crimes has a greater maximum value
    max_crime1 = df[corr_crime_1].max()
    max_crime2 = df[corr_crime_2].max()
    most_maxed = corr_crime_1 if max_crime1 > max_crime2 else corr_crime_2


    # PART 2
    # Sort by number of murders in ascending order
    df_since_2000 = df[df.index >= 2000]
    df_sorted = df_since_2000.sort_values(by='Murder')

    # Find years where Aggravated Assault is greater than 850,000
    years = df_sorted[df_sorted['Aggravated Assault'] > 850_000].index.values

    
    # PART 3
    # find the year with highest count of crimes
    df['Crime Rate'] = df['Total'] / df['Population']
    biggest_year = df['Crime Rate'].idxmax()
    
    # sort this year ignoring Population and Total then get the index
    worst = df.loc[biggest_year].sort_values(ascending=False)[2:].idxmax()
    
    # compute percentage and save as a float
    percent_crime = float(df.loc[biggest_year][worst] / df.loc[biggest_year]['Total'])

    return most_maxed, years, percent_crime


In [3]:
prob3()

['Property', 'Burglary', 'Larceny']


('Property',
 array([2000, 2001, 2002, 2003, 2005, 2007, 2006], dtype=int64),
 0.8997188308734142)

# Problem 4

In [None]:
def prob4(file='DJIA.csv'):
    """

    Read the data with a DatetimeIndex as the index.
    Drop rows any rows without numerical values, cast the "VALUE" column to floats, then return the updated DataFrame.

    Parameters:
        file (str): data file
    Returns:
        df (DataFrame): updated DataFrame of stock market data
    """
    # read the csv file
    df = pd.read_csv(file, dtype={'VALUE':np.float64}, na_values='.')

    # remove rows with non-numeric values in "VALUE" 
    df.set_index(pd.to_datetime(df['DATE']), inplace=True)
    df.drop(columns='DATE', inplace=True)

    return df

In [None]:
prob4()

# Problem 5

In [None]:
def prob5(file='paychecks.csv'):
    """
    Create data_range for index of paycheck data.

    Parameters:
        file (str): data file
    Returns:
        df (DataFrame): DataFrame of paycheck data
    """
    # read in the fil
    df = pd.read_csv(file, names=['Pay Amount'])
    
    # create datetime index for March 13th, 2008
    ind_df = pd.date_range(start='3/13/2008', periods=93, freq='2W-FRI')
    
    # set the index column to be datetime index
    df.set_index(ind_df, inplace=True)
    df.index.rename('Date', inplace=True)
    
    return df

In [None]:
prob5()

# Problem 6

In [None]:
def prob6(file='DJIA.csv'):
    """
    Compute the following information about the DJIA dataset
    1. The single day with the largest gain
    2. The single day with the largest loss

    Parameters:
        file (str): data file
    Returns:
        max_day (<M8[ns]): DateTimeIndex of maximum change
        min_day (<M8[ns]): DateTimeIndex of minimum change
    """
    # use prob4 to load cleaned file data
    df = prob4(file)
    
    # obtain shifted dates
    shifts = df - df.shift(1)
    
    # get days for maxes and mins
    ind_max = shifts.idxmax()
    ind_min = shifts.idxmin()

    return ind_max[0], ind_min[0]

In [None]:
prob6()