In [31]:
# Imports - avoid importing multiple copies
import pandas as pd
from decimal import Decimal
import locale

In [3]:
# Column-oriented sample records: each key becomes a column, each list holds that column's values
data = {
    "Name": ["Jack", "Jill", "Frank", "Leon"],
    "Age": [27, 40, 55, 25],
    "Career": [
        "IT Manager",
        "Scrum Master",
        "Legacy Developer",
        "Jr Software Engineer",
    ],
    "Pay": [150000, 200000, 175000, 65000],
}

# Build the frame and switch it to pyarrow-backed dtypes in one chained expression
df = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")

display(df)

Unnamed: 0,Name,Age,Career,Pay
0,Jack,27,IT Manager,150000
1,Jill,40,Scrum Master,200000
2,Frank,55,Legacy Developer,175000
3,Leon,25,Jr Software Engineer,65000


In [7]:
# Add a column to the dataframe: assign() returns a new frame carrying the extra
# column, which is bound back to `df` (one value per existing row, in row order)
df = df.assign(Country=["USA", "France", "South Africa", "China"])
display(df)

Unnamed: 0,Name,Age,Career,Pay,Country
0,Jack,27,IT Manager,150000,USA
1,Jill,40,Scrum Master,200000,France
2,Frank,55,Legacy Developer,175000,South Africa
3,Leon,25,Jr Software Engineer,65000,China


In [9]:
# Remove a column from the dataframe
# columns=[...] is the keyword spelling of axis=1 (axis 0 is row, 1 is column)
df = df.drop(columns=["Country"])
display(df)

Unnamed: 0,Name,Age,Career,Pay
0,Jack,27,IT Manager,150000
1,Jill,40,Scrum Master,200000
2,Frank,55,Legacy Developer,175000
3,Leon,25,Jr Software Engineer,65000


In [10]:
# Treating "Pay" as Euros, derive a new USD column by scaling every row
# with a single fixed conversion rate
EUR_TO_USD_RATE = 1.09
df["Pay in USD"] = df["Pay"].mul(EUR_TO_USD_RATE)
display(df)

Unnamed: 0,Name,Age,Career,Pay,Pay in USD
0,Jack,27,IT Manager,150000,163500.0
1,Jill,40,Scrum Master,200000,218000.0
2,Frank,55,Legacy Developer,175000,190750.0
3,Leon,25,Jr Software Engineer,65000,70850.0


In [11]:
# Filtering: build a boolean mask first, then keep only the rows where it is True
is_over_30 = df["Age"] > 30
subset = df.loc[is_over_30]

display(subset)

Unnamed: 0,Name,Age,Career,Pay,Pay in USD
1,Jill,40,Scrum Master,200000,218000.0
2,Frank,55,Legacy Developer,175000,190750.0


In [16]:
# Overwrite a column for ALL rows - a scalar is broadcast to every row
# (same syntax as adding a column)
df["Country"] = "USA"

# Overwrite a column for SELECTED rows only: .loc[row_mask, column_name]
is_scrum_master = df["Career"] == "Scrum Master"
df.loc[is_scrum_master, "Country"] = "Some Other Country"
display(df)

Unnamed: 0,Name,Age,Career,Pay,Pay in USD,Country
0,Jack,27,IT Manager,150000,163500.0,USA
1,Jill,40,Scrum Master,200000,218000.0,USA
2,Frank,55,Legacy Developer,175000,190750.0,USA
3,Leon,25,Jr Software Engineer,65000,70850.0,USA


In [38]:
# In-class exercise:
# Load the order data from the URL below, relabel the misnamed columns, drop the
# columns not needed for analysis, and add per-row total and average item cost columns.
url = "https://raw.githubusercontent.com/BriDeWaltCCC/PFDADataSets/main/Class6Exercise1.csv"

# The three item cost columns feed both calculated columns, so name them once
item_cost_columns = ["Item 1 Cost", "Item 2 Cost", "Item 3 Cost"]

df = (
    pd.read_csv(url)
    # Rename bad columns
    .rename(columns={"Country": "Order Date", "City": "Order Time"})
    # Remove columns not needed for analysis
    .drop(columns=["Text", "Lat Lng"])
    # Set pyarrow data types
    .convert_dtypes(dtype_backend="pyarrow")
)
# No explicit astype() on the cost columns: astype() requires a target dtype and
# raises TypeError when called with no argument, and the dtypes printed below
# show what convert_dtypes() already inferred for them.

# Add a calculated total column (row-wise sum across the item cost columns)
df["Total Cost"] = df[item_cost_columns].sum(axis=1)
# Add a calculated average column (row-wise mean, rounded to 2 decimal places)
df["Avg Cost"] = df[item_cost_columns].mean(axis=1).round(2)

print(df.dtypes)
display(df)

Name           string[pyarrow]
Order Date     string[pyarrow]
Order Time     string[pyarrow]
Item 1 Cost     int64[pyarrow]
Item 2 Cost     int64[pyarrow]
Item 3 Cost     int64[pyarrow]
Total Cost      int64[pyarrow]
Avg Cost       double[pyarrow]
dtype: object


Unnamed: 0,Name,Order Date,Order Time,Item 1 Cost,Item 2 Cost,Item 3 Cost,Total Cost,Avg Cost
0,Dominic English,"Dec 28, 2023",7:03 AM,2,3,3,8,2.67
1,Jocelyn Villarreal,"Aug 1, 2023",7:51 PM,9,5,7,21,7.0
2,Guinevere Wynn,"Jan 6, 2023",2:02 PM,1,2,7,10,3.33
3,Alexander Ramos,"Nov 19, 2023",12:12 AM,7,9,5,21,7.0
4,Daryl Kane,"Mar 1, 2024",8:50 PM,4,9,9,22,7.33
...,...,...,...,...,...,...,...,...
495,Basil Lane,"Feb 29, 2024",7:43 AM,4,9,9,22,7.33
496,Vincent Lindsey,"May 13, 2024",7:32 AM,3,0,4,7,2.33
497,Evan Baldwin,"Nov 24, 2023",3:23 PM,6,7,8,21,7.0
498,Martin Strong,"Mar 17, 2023",2:47 AM,9,9,9,27,9.0


In [19]:
# Pat Lacey's version from class
url = "https://raw.githubusercontent.com/BriDeWaltCCC/PFDADataSets/main/Class6Exercise1.csv"

df = pd.read_csv(url)
# (df.head() is a quick way to check that the data was pulled from the url)

# Work on a separate frame so the freshly loaded `df` is left as it was read:
# without inplace=True, drop() and rename() each hand back a new DataFrame.
# "Text" is dropped because it seems to be filled with gibberish, and the
# "Country" / "City" columns are renamed to the date and time.
edited_df = (
    df.drop(columns=["Text"])
    .rename(columns={"Country": "Date", "City": "Time"})
)

# Total item cost: element-wise sum of the three item cost columns
edited_df["Total Item Cost"] = (
    edited_df["Item 1 Cost"]
    .add(edited_df["Item 2 Cost"])
    .add(edited_df["Item 3 Cost"])
)

# Average item cost: dividing by 3 produces long repeating decimals, so the
# result is rounded to 2 decimal places (not necessary, but more readable)
edited_df["Average Item Cost"] = edited_df["Total Item Cost"].div(3).round(2)

# Separate the "Lat Lng" column into latitude and longitude:
# strip the closing and opening parentheses, then split on ", " so that each
# row holds a list of its parts
split_data = edited_df["Lat Lng"].str.strip(")").str.strip("(").str.split(", ")

# First part of each row becomes Latitude, second part becomes Longitude
edited_df["Latitude"] = split_data.map(lambda parts: parts[0])
edited_df["Longitude"] = split_data.map(lambda parts: parts[1])
# The original combined column is no longer needed once it has been separated
edited_df = edited_df.drop(columns=["Lat Lng"])

# Other considerations:
# - split the name column into first name and last name columns
# - depending on the analysis, date, time, latitude, and longitude could be dropped

edited_df

Unnamed: 0,Name,Date,Time,Item 1 Cost,Item 2 Cost,Item 3 Cost,Total Item Cost,Average Item Cost,Latitude,Longitude
0,Dominic English,"Dec 28, 2023",7:03 AM,2,3,3,8,2.67,85.7277135872,101.7198369792
1,Jocelyn Villarreal,"Aug 1, 2023",7:51 PM,9,5,7,21,7.00,62.8774593536,-152.7263755264
2,Guinevere Wynn,"Jan 6, 2023",2:02 PM,1,2,7,10,3.33,-52.7344432128,64.3580158976
3,Alexander Ramos,"Nov 19, 2023",12:12 AM,7,9,5,21,7.00,68.6334992384,83.9410893824
4,Daryl Kane,"Mar 1, 2024",8:50 PM,4,9,9,22,7.33,-89.0586281984,-139.2981598208
...,...,...,...,...,...,...,...,...,...,...
495,Basil Lane,"Feb 29, 2024",7:43 AM,4,9,9,22,7.33,-55.892100608,19.4356921344
496,Vincent Lindsey,"May 13, 2024",7:32 AM,3,0,4,7,2.33,51.5248698368,64.7653787648
497,Evan Baldwin,"Nov 24, 2023",3:23 PM,6,7,8,21,7.00,73.142357504,156.733807104
498,Martin Strong,"Mar 17, 2023",2:47 AM,9,9,9,27,9.00,64.5299096576,-81.1847581696
