In [None]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# Maps each supported city name (lowercase) to the CSV file holding its data.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}


def get_filters():
    """
    Asks user to specify a city, month, and day to analyze.

    Every answer is stripped of surrounding whitespace and lower-cased before
    it is checked, and each question is repeated until the answer is one of
    the accepted choices.

    Returns:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    def ask_until_valid(prompt, choices):
        """Keep asking `prompt` until the normalised answer is in `choices`."""
        while True:
            answer = input(prompt).strip().lower()
            if answer in choices:
                return answer
            print("Your input is invalid. Please try again.")

    print('Hello! Let\'s explore some US bikeshare data!')

    # city (chicago, new york city, washington)
    city = ask_until_valid(
        "\nWould you like to see data for Chicago, New York City, or Washington ?\n",
        ('chicago', 'new york city', 'washington'),
    )

    # month (all, january, february, ... , june)
    month = ask_until_valid(
        "\nWhich month ? January, February, March, April, May, June or All? \n",
        ('january', 'february', 'march', 'april', 'may', 'june', 'all'),
    )

    # day of week (all, monday, tuesday, ... sunday)
    day = ask_until_valid(
        "\nWhich day ? Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday or All?\n",
        ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
         'sunday', 'all'),
    )

    print('-'*40)
    return city, month, day

def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Two helper columns are added to the frame before filtering: 'month'
    (1-12) and 'day_of_week' (pandas numbering, Monday=0 ... Sunday=6), both
    derived from 'Start Time'.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - pandas DataFrame containing city data filtered by month and day,
             or None when a ValueError occurred (its arguments are printed)
    """
    months = ['january', 'february', 'march', 'april', 'may', 'june']
    # Order must match Series.dt.dayofweek, which counts from Monday=0.
    days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
            'saturday', 'sunday']

    try:
        # load data file into a dataframe and parse the 'Start Time' column.
        df = pd.read_csv(CITY_DATA[city], parse_dates=['Start Time'])

        # extract month and day of week from Start Time to create new columns
        df['month'] = df['Start Time'].dt.month
        df['day_of_week'] = df['Start Time'].dt.dayofweek

        # filter by month if applicable (list position + 1 == month number)
        if month != 'all':
            df = df[df['month'] == months.index(month) + 1]

        # filter by day of week if applicable (list position == dayofweek)
        if day != 'all':
            df = df[df['day_of_week'] == days.index(day)]

        return df

    except ValueError as e:
        # e.g. an unknown month/day name; report it and signal failure.
        print(e.args)
        return None
        
def time_stats(df):
    """Displays statistics on the most frequent times of travel.

    Prints the most common value of the 'month' and 'day_of_week' columns and
    the most common hour of 'Start Time'. The input frame is not modified.
    When the frame has no rows, a notice is printed instead of the statistics.

    Args:
        df - pandas DataFrame with 'Start Time' (datetime), 'month' and
             'day_of_week' columns
    """

    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()

    if df.empty:
        # mode() of an empty column has no element 0, so there is nothing to report.
        print('No trips match the selected filters.')
    else:
        # display the most common month
        most_common_month = df['month'].mode()[0]
        print ('The most common month is {}'. format(most_common_month))

        # display the most common day of week
        most_common_day_of_week = df['day_of_week'].mode()[0]
        print ('The most common day of week is {}'. format(most_common_day_of_week))

        # display the most common start hour; the hour is derived on the fly so
        # no extra column is written into the caller's (possibly sliced) frame
        most_common_start_hour = df['Start Time'].dt.hour.mode()[0]
        print ('The most common start hour is {}'. format(most_common_start_hour))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)



def station_stats(df):
    """Displays statistics on the most popular stations and trip.

    Prints the most common start station, the most common end station, and
    the (start, end) pair that occurs together most often. When the frame has
    no rows, a notice is printed instead of the statistics.

    Args:
        df - pandas DataFrame with 'Start Station' and 'End Station' columns
    """

    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    if df.empty:
        print('No trips match the selected filters.')
    else:
        # display most commonly used start station
        mostcommonly_startstation = df['Start Station'].mode()[0]
        print ('The most commonly used start station is : {}'.format(mostcommonly_startstation))

        # display most commonly used end station
        mostcommonly_endstation = df['End Station'].mode()[0]
        print ('The most commonly used end station is : {}'.format(mostcommonly_endstation))

        # display most frequent combination of start station and end station trip.
        # The pair must be counted as a unit: taking the mode of each column
        # separately can name a start/end pair that never occurs together.
        pair_counts = df.groupby(['Start Station', 'End Station']).size()
        trip_start, trip_end = pair_counts.idxmax()
        print ('The most frequent combination of start station and end station trip is :\n{} to {}'.format(trip_start, trip_end))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

def trip_duration_stats(df):
    """Displays statistics on the total and average trip duration.

    Reads the 'Trip Duration' column, prints its sum floored to whole hours
    and its mean floored to whole minutes, then the elapsed time.
    """

    print('\nCalculating Trip Duration...\n')
    clock_start = time.time()

    durations = df['Trip Duration']

    # total travel time: floor to minutes first, then floor to hours
    whole_hours = durations.sum() // 60 // 60
    print('Total Travel Time (hours): ', whole_hours)

    # mean travel time, floored to whole minutes
    whole_minutes = durations.mean() // 60
    print('Mean Travel Time (minutes): ', whole_minutes)

    elapsed = time.time() - clock_start
    print("\nThis took %s seconds." % elapsed)
    print('-' * 40)

def user_stats(df):
    """Displays statistics on bikeshare users.

    Prints the value counts of 'User Type', then of 'Gender', then the
    earliest, most recent and most common 'Birth Year'. A missing 'Gender' or
    'Birth Year' column is reported with a message rather than an error.
    """

    print('\nCalculating User Stats...\n')
    clock_start = time.time()

    # counts per user type
    print ("The Number of User Types is: \n{0}".format(df['User Type'].value_counts()))

    # counts per gender -- the column is absent from some data files
    # (e.g. washington.csv)
    try:
        gender_tally = df['Gender'].value_counts()
    except KeyError:
        print ("This dataframe does not have the Gender Column")
    else:
        print ("The Number of Gender is:\n{0}".format(gender_tally))

    # earliest / most recent / most common birth year -- likewise optional
    birth_year_reports = (
        ('The Earliest Birth Year is : {}', lambda years: years.min()),
        ("The Most Recent Birth Year is : {}", lambda years: years.max()),
        ("The Most Common Birth Year is : {}", lambda years: years.mode()[0]),
    )
    try:
        birth_years = df['Birth Year']
        for template, pick in birth_year_reports:
            print (template.format(int(pick(birth_years))))
    except KeyError:
        print ("This dataframe does not have the Birth Year Column")

    print("\nThis took %s seconds." % (time.time() - clock_start))
    print('-' * 40)

def EDA_UserType(df):
    """Shows a pie chart of the value counts of the 'User Type' column."""
    type_counts = df['User Type'].value_counts()
    type_counts.plot.pie(colors=['green', 'black'])

    # decorate the current figure, then display it
    plt.title('User Type', fontweight=30, fontsize=20)
    plt.xlabel('User Type')
    plt.ylabel('count')
    plt.show()

def EDA_Gender(df):
    """Shows a pie chart of the value counts of the 'Gender' column.

    Not every data file has a 'Gender' column; when it is missing, a message
    is printed and no chart is drawn.
    """
    try:
        gender_counts = df['Gender'].value_counts()
    except KeyError:
        print ("This dataframe does not have the Gender Column")
        return

    gender_counts.plot.pie(colors=['black', 'yellow'])
    # Title and axis label name the plotted column (Gender, not User Type).
    plt.title('Gender', fontweight = 30, fontsize = 20)
    plt.xlabel('Gender')
    plt.ylabel('count')
    plt.show()
    
def display_raw_data(df):
    """Interactively prints a city's raw CSV data five rows at a time.

    The user is first asked whether to show raw data at all (answering 'n'
    returns immediately), then which city's file to read. The file is read
    once and shown in pages of five rows; after each page the user is asked
    whether to continue, until they decline or the data runs out.

    Args:
        df - not used; the rows shown are read from the chosen city's CSV
             file. Kept so existing callers keep working.
    """
    page_size = 5

    print('Do want to see the raw data (y/n)')
    if input().strip().lower() == 'n':
        return

    print('What dataset do you seek?\nOptions include "Chicago", "New York City" ,"Washington"')
    file = CITY_DATA.get(input().strip().lower())
    if file is None:
        print('error')
        return

    # Read once and slice, so every page keeps the real header row and
    # paging past the end of the file cannot raise.
    raw = pd.read_csv(file, na_values = ['no info', '.'])

    offset = 0
    while True:
        print(raw.iloc[offset:offset + page_size])
        offset += page_size
        if offset >= len(raw):
            # nothing left to show
            break

        print('\nMore data? (y/n)')
        if input().strip().lower() != 'y':
            break


def main():
    """Runs the interactive session.

    Each round asks for filters, loads the matching data, runs every report
    on it in a fixed order, and then asks whether to start over. Any answer
    other than 'yes' (case-insensitive) ends the session.
    """
    keep_going = True
    while keep_going:
        city, month, day = get_filters()
        df = load_data(city, month, day)

        # reports run in this order, each receiving the same frame
        for report in (
            time_stats,
            station_stats,
            trip_duration_stats,
            user_stats,
            EDA_UserType,
            EDA_Gender,
            display_raw_data,
        ):
            report(df)

        restart = input('\nWould you like to restart? Enter yes or no.\n')
        keep_going = restart.lower() == 'yes'


# Start the interactive session only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Hello! Let's explore some US bikeshare data!
