In [54]:
#Import pandas and numpy with shorthand so we can use them more easily in our code.
import pandas as pd
import numpy as np

In [55]:
#Install faker library to generate fake data, which this part of the assignment calls for... commented out because this has to be done, but not in
#the notebook itself. We have to run it in the terminal.

#pip install faker pandas

In [56]:
#Import faker and random to generate fake data for our dataframe
from faker import Faker
import random

In [57]:
#Read in state population data from the csv, storing it in a dataframe called df_stateprobs
df_stateprobs = pd.read_csv('state_populations.csv')

In [58]:
#Calculate the total population across all states
total_population = df_stateprobs['Population'].sum()
#Calculate the probability of selecting a person from each state based on its population relative to the total population
df_stateprobs['Probability'] = df_stateprobs['Population'] / total_population
#Assign weights to each state based on calculated probability
weights = df_stateprobs['Probability'].values

In [59]:
#function that generates fake data for n people with specified weights
#For each person in the range, we generate first name, last name, age, us state (based on weights), and a random sentence
#Stored in a dictionary, appended to the list, and then converted to a dataframe at the end
def generate_people(n: int, weights: list) -> pd.DataFrame:
    fake = Faker()
    data = []
    for _ in range(n):
        person = {
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'age': np.random.randint(18, 80),
            'us_state': random.choices(df_stateprobs['State'], weights=weights, k=1)[0],
            'random_sentence': fake.sentence()
        }
        data.append(person)
    return pd.DataFrame(data)

In [60]:
#generate 200 fake people with the function from above and weights we calculated for each state
n = 200
df_people = generate_people(n, weights)

In [61]:
#make csv frokm the dataframe we generated with fake people data
df_people.to_csv('people.csv', index=False)

In [62]:
#load in csv to answer questions, storing it in a dataframe called df_people
df_people = pd.read_csv('people.csv')

In [63]:
#question 1: What is the average age in the dataset for the 10 most populous states combined?

#Identify the 10 most populous states from the df_stateprobs dataframe
populous_states = df_stateprobs.nlargest(n=10, columns=['Population'])

#Declare varables to keep track of total age and total people in the 10 most populous states
total_age = 0
total_people = 0

#For each state in the 10 most populous states, filter to get the people from those states
#Sum their ages and add to the total_age
#count the number of people and add to the total_people
for state in populous_states['State']:
    state_people = df_people[df_people['us_state'] == state]
    total_age += state_people['age'].sum()
    total_people += len(state_people)

#Calculate average age in the dataset for the 10 most populous states combined by dividing total age by total people
mean_age = total_age / total_people

#Print mean age
print("The average age in the dataset for the 10 most populous states combined is " + str(mean_age) + " years old.")
   

The average age in the dataset for the 10 most populous states combined is 52.15384615384615 years old.


In [64]:
#question 2: For the 10 most populous states, how many people in the dataset share a letter between their first and last name?

#Declare variable to keep track of total people who share a letter between their first and last name
total_shared_letter = 0

#For each state in the 10 most populous states, filter to get the people from those states
#For each person in those states, check if there is any shared letter between their first and last name by converting the names to sets and checking for intersection
#If there is a shared letter, add 1 to the total_shared_letter variable
for state in populous_states['State']:
    state_people = df_people[df_people['us_state'] == state]
    for _, person in state_people.iterrows():
        if set(person['first_name']).intersection(set(person['last_name'])):
            total_shared_letter += 1

#Print total people who share a letter between their first and last name in the 10 most populous states
print("The total number of people who share a letter between their first and last name in the 10 most populous states is " + str(total_shared_letter) + ".")


The total number of people who share a letter between their first and last name in the 10 most populous states is 87.


In [65]:
#question 3: Write a function str_to_value(s: str) -> int that converts each character of a string to its ASCII value equivalent and sums those together.

#Make function that takes in a string and converts it to its ASCII value (int)
#Make an empty list to store the ASCII values
#For each character in the string, append its ASCII value (using ord()) to the list
#Return the ASCII values in the list
def str_to_value(s: str) -> int:
    ascii_values = []
    for char in s:
        ascii_values.append(ord(char))
    return ascii_values

In [66]:
#question 4: Create a new column in your Person dataframe that applies this function to their sentence. 
#Compute the 90th percentile of this column and then print all states with a person whose sentence score is above that 90th percentile. 

#Apply the str_to_value function to the random_sentence column in the df_people dataframe and store it in a new column of df_people called ASCII_sentence_sum
#Obtain the ASCII of each string of the sentence, and sum values together to get total ASCII of the whole sentence
df_people['ASCII_sentence_sum'] = df_people['random_sentence'].apply(lambda x: sum(str_to_value(x)))

#Calculate ninety percentile of the ASCII_sentence_sum column in the df_people dataframe
ninety_percentile = df_people['ASCII_sentence_sum'].quantile(0.9)

#Filter the df_people dataframe to get the rows where the ASCII_sentence_sum is above the ninety percentile
#Print the us_state column of those rows to get the states with a person whose sentence score is above the 90th percentile
results = df_people[df_people['ASCII_sentence_sum'] > ninety_percentile]['us_state']
print(results)

2         Connecticut
23              Texas
32          Wisconsin
33           Oklahoma
35         California
42             Kansas
77               Utah
82     South Carolina
103           Florida
114        California
115        California
126     Massachusetts
128             Texas
154          Illinois
163          Virginia
167              Ohio
177      Pennsylvania
182        New Jersey
193         Minnesota
194        Washington
Name: us_state, dtype: str
