General Stuff:

Use ctrl + enter to run cells<br>
Variables save between cells<br>
If the last line of a cell is a bare expression (no `=` assignment), its value is displayed automatically, which is really nice for viewing dataframes

In [None]:
# Basic Python warm-up: variables, conditionals, loops and dictionaries

x = 10  # assignment creates the variable; no type declaration is required
my_data = dict()  # calling the constructor makes the intended container type explicit

print('if statement')
if x == 10:
    print(x)

print('\nrange for loop')
# range(start, stop, step): start is included, stop is excluded
for i in range(3, x, 2):
    print(i)

print('\nlist for loop')
stuff = ['hi', 'I', 'love', 'KTP']
for thing in stuff:
    print(thing)

print('\nlist for dictionary')

# Fill the dictionary in one call; insertion order matches the order written here
my_data.update({
    't-shirts': 10,
    'socks': 15,
    'underwear': 15,
    'pants': 5,
    'hoddies': 3,
})

# .items() yields (key, value) pairs, unpacked into two loop variables
for key, data in my_data.items():
    print(f'{key}: {data}')

print('\naccessing one value in a dictionary')
print(f"printing the number of socks: {my_data['socks']}")

In [None]:
import pandas as pd

# Load the two practice tables into dataframes.
# The paths are relative, so they resolve against the current working directory.
pets_df = pd.read_csv('practice_data/pets.csv')
owners_df = pd.read_csv('practice_data/owners.csv')

In [None]:
# A bare expression on the last line of a cell is displayed as the cell's output
owners_df

In [None]:
# A bare expression on the last line of a cell is displayed as the cell's output
pets_df

In [None]:
# Quick first look at the pets table.
# Only the last bare expression of a cell is displayed automatically, so the
# earlier results are printed explicitly; info() writes its own report.
print(pets_df.head())  # First 5 rows
pets_df.info()  # Column types and non-null counts
print(pets_df.describe())  # Summary stats for numerical columns
pets_df.columns  # List of column names


In [None]:
# Accessing a column in a data frame:
# square brackets with a column name select that single column
pets_df['Name']

In [None]:
# Only the last bare expression of a cell is displayed, and dropna(inplace=True)
# returns None, so the missing-value counts are printed explicitly.
print(pets_df.isnull().sum())  # Count missing values per column
pets_df.dropna(inplace=True)  # Remove rows with missing data (modifies pets_df itself)
# Alternative to dropping rows (run it INSTEAD of dropna): fill the gaps.
# A median only exists for numeric columns, so the example uses 'Age';
# 'Name' presumably holds text and has no median.
#pets_df.fillna({'Age': pets_df['Age'].median()}, inplace=True)  # Replace NaN in 'Age' with the median age


In [None]:
# Understanding the data: we can simply count how often each age occurs.
# Only the last bare expression of a cell is displayed, so the first two
# variants are printed explicitly.
print(pets_df['Age'].value_counts())  # counts of pet ages, most common first
print(pets_df['Age'].value_counts().sort_index())  # counts of pet ages, ascending by age
pets_df['Age'].value_counts().sort_index().reset_index()
# sorts rows by index label (the age), then resets the index --> the ages become
# a regular column and a new default integer index is created

In [None]:
#or we can make simple graphs to view distribution better
import plotly.express as px
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd
import numpy as np

# Create bar chart
# Build a two-column table (Age, count) for plotting. The index and the counts
# are named explicitly so the column names do not depend on the pandas version:
# pandas 2.0+ already produces 'Age'/'count', older releases produce 'index'/'Age'.
age_counts = (
    pets_df['Age']
    .value_counts()
    .sort_index()  # order the bars by age rather than by frequency
    .rename_axis('Age')
    .reset_index(name='count')
)

fig = px.bar(
    age_counts,
    x='Age',
    y='count',
    text='count',  # Display count values on bars
    title="Distribution of Pet Ages",
    labels={'Age': 'Pet Age', 'count': 'Number of Pets'},  # Axis labels
    color='count',  # Color bars based on count for better visualization
    color_continuous_scale='Blues'  # Aesthetic color scale
)

# Show figure
fig.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tally how many pets share each age, ordered by age
age_counts = pets_df['Age'].value_counts().sort_index()

# Draw one bar per age on an 8x6 inch canvas
plt.figure(figsize=(8, 6))
sns.barplot(
    x=age_counts.index,   # the distinct ages
    y=age_counts.values,  # how many pets have each age
    palette="Blues",
)
plt.title("Distribution of Pet Ages")
plt.ylabel("Number of Pets")
plt.xlabel("Pet Age")
plt.xticks(rotation=45)  # tilt the tick labels
plt.show()


In [None]:
# Histogram of pet ages: 10 bins with a smoothed density curve (kde) on top
plt.figure(figsize=(8, 6))
sns.histplot(
    pets_df['Age'],
    bins=10,
    kde=True,
    color='blue',
)
plt.xlabel("Age")
plt.ylabel("Number of Pets")
plt.title("Distribution of Pet Ages")
plt.show()

In [None]:
# Filtering: combine conditions with & (each wrapped in parentheses) into a
# boolean mask, then index the dataframe with that mask
is_old_dog = (pets_df['Kind'] == 'Dog') & (pets_df['Age'] > 10)
old_dogs = pets_df[is_old_dog]
print(len(old_dogs))  # prints number of dogs above the age of 10
old_dogs.sort_values('Name')


In [None]:
# Several things happen in this one statement:
# 1. groupby('Kind') splits the rows by type of animal (so cat, dog, and parrot)
# 2. .agg computes one value per group for each requested output column
#      Age    -> the maximum age within the group
#      Gender -> the mode (most frequent value) of the gender within the group
#                this uses a lambda, a short inline function; it receives the
#                group's Gender values, and [0] takes the first mode returned

pets_df.groupby('Kind').agg(
    Age=('Age', 'max'),
    Gender=('Gender', lambda genders: genders.mode()[0]),
)
#pets_df.groupby('Kind')['Age'] # selects the Age column of the grouped object --> prepares for an aggregation

In [None]:
# merge combines two tables by matching rows on a shared key column;
# how='inner' keeps only the rows whose OwnerID appears in both tables
merged = pets_df.merge(right=owners_df, on='OwnerID', how='inner')
merged

In [None]:
# The merge left two clashing name columns, suffixed _x (left table: pets)
# and _y (right table: owners); give them meaningful names instead
merged = merged.rename(
    {
        'Name_x': 'Pet_Name',
        'Name_y': 'Owner_Name',
    },
    axis='columns',
)
merged[['Pet_Name', 'Owner_Name']]  # a list of column names selects just those columns for display

In [None]:
# Group-wise statistics on the pets table.
# (This cell previously used 'genre', 'tempo' and 'popularity', columns that the
# rest of this notebook never uses on pets_df, so it is written against
# 'Kind' and 'Age' instead.)
print(pets_df.groupby('Kind')['Age'].mean())  # average age per kind of pet
# idxmax gives, per kind, the index label of the row with the largest age;
# .loc then pulls those full rows out of the dataframe
pets_df.loc[pets_df.groupby('Kind')['Age'].idxmax()]  # oldest pet per kind


In [None]:
#making new column
# Assigning to a column name that does not exist yet creates the column; the
# right-hand side is evaluated for every row from existing columns.
# (This cell previously combined 'energy', 'loudness' and 'danceability', columns
# that the rest of this notebook never uses on pets_df, so the example is built
# from 'Age' instead.)
pets_df['Older_Than_10'] = pets_df['Age'] > 10  # True for pets above the age of 10


Other functions to look into <br>

Pivoting: good for when you want to reformat your data <br>

Different Types of Graphs: <br>
    bar graphs <br>
    histograms <br>
    scatterplots + line of best fit <br>
    box + whisker plots <br>
    


line 

bar chart = counting frequency 
    EX: most common names, number of pets by kind
histogram = distribution of numerical data 
    EX: distribution of pet ages 
pie chart = proportions 
    EX: percentage of each kind 
box plot = compare distributions and outliers 
    EX: compare pet ages for different kinds 
scatter plot = relationships/correlations between 2 numeric variables 
    EX: owner id vs pet age
line chart = track changes over time 
    EX: how many pets exist at each age 

**YOUR TASKS** <br>

using the merged dataframe, create some sort of graph of your choosing on a column <br>

find the most common name of the owners and do the same for the pets <br>

filter the data frame for only owners who are from ['Ann Arbor', 'Grand Rapids', 'Detroit', 'Livonia'] and have a surname that starts with a letter in the first half of the alphabet