# Working with data files using Pandas Library

In [7]:
import csv

# Read the weather CSV with the standard-library csv module, echo every row,
# and collect the integer temperatures found in the second column.
# newline="" lets the csv reader (not open()) interpret line endings, which is
# what the csv documentation asks for when a file is handed to csv.reader.
with open("Garbage_data/weather_data.csv", newline="") as file:
    data = csv.reader(file)  # iterator that yields each row as a list of strings
    temperatures = []
    for row in data:
        # The header row carries the literal text "temp" in this column; skip it.
        if row[1] != "temp":
            temperatures.append(int(row[1]))
        print(row)
    print(temperatures)
    

['day', 'temp', 'condition']
['Monday', '12', 'Sunny']
['Tuesday', '14', 'Rain']
['Wednesday', '15', 'Rain']
['Thursday', '14', 'Cloudy']
['Friday', '21', 'Sunny']
['Saturday', '22', 'Sunny']
['Sunday', '24', 'Sunny']
[12, 14, 15, 14, 21, 22, 24]


In [35]:
import pandas as pd

# Load the same weather file with pandas; the result is a DataFrame.
weather_csv_path = "Garbage_data/weather_data.csv"
data = pd.read_csv(weather_csv_path)
print(data)

         day  temp condition
0     Monday    12     Sunny
1    Tuesday    14      Rain
2  Wednesday    15      Rain
3   Thursday    14    Cloudy
4     Friday    21     Sunny
5   Saturday    22     Sunny
6     Sunday    24     Sunny


In [None]:
# Pull the "temp" column out once and inspect it.
temp_column = data["temp"]
print(temp_column)
print(type(temp_column))  # each column of a DataFrame is a pandas Series
print(type(data))  # the table as a whole is a pandas DataFrame

0    12
1    14
2    15
3    14
4    21
5    22
6    24
Name: temp, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [None]:
# A DataFrame can be turned into a nested dict: {column -> {index -> value}}.
data_dict = data.to_dict(orient="dict")  # "dict" is the default orientation
print(data_dict)

{'day': {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}, 'temp': {0: 12, 1: 14, 2: 15, 3: 14, 4: 21, 5: 22, 6: 24}, 'condition': {0: 'Sunny', 1: 'Rain', 2: 'Rain', 3: 'Cloudy', 4: 'Sunny', 5: 'Sunny', 6: 'Sunny'}}


In [15]:
# Convert the "temp" Series into a plain Python list.
temp_list = data.temp.to_list()
temp_list

[12, 14, 15, 14, 21, 22, 24]

In [16]:
# Average by hand: the total of the readings divided by how many there are.
total_temp = sum(temp_list)
total_temp / len(temp_list)

17.428571428571427

In [None]:
# The same average straight from pandas: mean() is a method of the Series object.
data.temp.mean()

np.float64(17.428571428571427)

In [None]:
# Largest temperature in the column, converted to a built-in int.
int(data.temp.max())

np.int64(24)

In [None]:
# Two equivalent ways to reach the "condition" column; both print the same Series.
print(data["condition"])
print(data.condition) # attribute access: the DataFrame exposes each column name as an attribute that returns that column's Series

0     Sunny
1      Rain
2      Rain
3    Cloudy
4     Sunny
5     Sunny
6     Sunny
Name: condition, dtype: object
0     Sunny
1      Rain
2      Rain
3    Cloudy
4     Sunny
5     Sunny
6     Sunny
Name: condition, dtype: object


But how do we get the data stored in the rows instead of the columns?

In [23]:
# Select the row(s) whose "day" is Monday. Indexing a DataFrame with a boolean
# mask gives back a DataFrame, even when only one row matches.
monday_rows = data[data["day"] == "Monday"]
# Only a cell's last expression is displayed automatically, so the type has to
# be printed explicitly for it to show up in the output.
print(type(monday_rows))
monday_rows

Unnamed: 0,day,temp,condition
0,Monday,12,Sunny


In [36]:
# What does the expression inside the brackets evaluate to?
is_monday = data["day"] == "Monday"
print(is_monday)  # a Series of booleans; indexing the DataFrame with it keeps only the rows marked True
print(type(is_monday))  # the comparison itself yields a Series

0     True
1    False
2    False
3    False
4    False
5    False
6    False
Name: day, dtype: bool
<class 'pandas.core.series.Series'>


In [29]:
# Keep the Monday row as its own DataFrame, then look at its "condition" column.
monday = data[data["day"] == "Monday"]
monday["condition"]

0    Sunny
Name: condition, dtype: object

In [31]:
# Extract the scalar temperature from the one-row selection.
# .iloc[0] takes the first row by position. A label lookup such as
# monday.temp[0] only works while the matching row happens to carry index
# label 0, so it would fail for any other day of the week.
monday_temp = monday.temp.iloc[0]
print(type(monday_temp))
monday_temp_f = monday_temp * 9/5 + 32  # Celsius -> Fahrenheit
print(monday_temp_f)

<class 'numpy.int64'>
53.6


In [None]:
# Build a DataFrame from scratch: every dict key becomes a column name and
# every list supplies that column's values.
data_dict = {
    "students": ["Amy", "James", "Angela"],
    "scores": [76, 56, 65],
}
data = pd.DataFrame(data=data_dict)
print(data)  # show the freshly created DataFrame

  students  scores
0      Amy      76
1    James      56
2   Angela      65


In [33]:
# Persist the DataFrame to disk as a CSV file called new_data.csv.
data.to_csv(path_or_buf="Garbage_data/new_data.csv")

Now, let's apply what we have learned in `pandas` to a file about squirrels that I got from [this link](https://data.cityofnewyork.us/Environment/2018-Central-Park-Squirrel-Census-Squirrel-Data/vfnx-vebw/about_data).

In [59]:
import pandas as pd
# Load the squirrel census file; as before, read_csv returns a DataFrame.
squirrel_csv_path = "Garbage_data/2018_Central_Park_Squirrel_Census_-_Squirrel_Data_20250605.csv"
data = pd.read_csv(squirrel_csv_path)

In [60]:
# Filtering rows with a boolean mask still gives back a DataFrame.
cinnamon_rows = data[data["Primary Fur Color"] == "Cinnamon"]
type(cinnamon_rows)

pandas.core.frame.DataFrame

In [55]:
# Count how many rows have each primary fur color, in the order listed.
colors = ["Gray", "Cinnamon", "Black"]
count = [len(data[data["Primary Fur Color"] == fur_color]) for fur_color in colors]

In [57]:
# Assemble the colors and their counts into a two-column summary table.
summary_dict = {"Fur Color": colors, "Count": count}
summary_data = pd.DataFrame(data=summary_dict)
summary_data

Unnamed: 0,Fur Color,Count
0,Gray,2473
1,Cinnamon,392
2,Black,103


In [58]:
# Save the summary table as a CSV file.
summary_data.to_csv(path_or_buf="Garbage_data/squiral_summary.csv")

Now, let's apply what we have learned to a project: a game that tests your knowledge of the US states.

In [None]:
import pandas as pd
import turtle as tr

# US States guessing game: the player types state names and every correct
# guess is written onto the blank map at that state's coordinates.
# Typing "Exit" (or cancelling the dialog) ends the game and writes the states
# that were not guessed to a .csv file.
screen = tr.Screen()
screen.title("US States Game")
image = "Garbage_data/blank_states_img.gif"
screen.addshape(image)  # register the map image so it can be used as a turtle shape

tr.shape(image)

data = pd.read_csv("Garbage_data/50_states.csv")
all_states = data.state.to_list()

# One hidden, pen-up turtle writes every label, instead of creating a brand-new
# turtle for each correct guess.
t = tr.Turtle()
t.hideturtle()
t.penup()

guessed_states = []
while len(guessed_states) < 50:

    raw_answer = screen.textinput(title=f"{len(guessed_states)}/50",
                                  prompt="What's another state's name?")

    # textinput() returns None when the dialog is cancelled; calling .title()
    # on None would raise AttributeError, so a cancel is treated like "Exit".
    # strip() forgives accidental leading/trailing spaces in the answer.
    answer_state = "Exit" if raw_answer is None else raw_answer.strip().title()

    if answer_state == "Exit":
        # Everything the player has not named yet goes into the study list.
        missing_states = [state for state in all_states if state not in guessed_states]

        new_data = pd.DataFrame(missing_states)
        new_data.to_csv("Garbage_data/states_to_learn.csv")
        break

    # Count a state only the first time it is guessed; otherwise repeating one
    # name would inflate the score and end the loop before all states are found.
    if answer_state in all_states and answer_state not in guessed_states:
        guessed_states.append(answer_state)
        state_data = data[data.state == answer_state]  # pull out the matching row (a DataFrame)
        t.goto(x=state_data.x.item(), y=state_data.y.item())
        # item() returns the single value of a one-element Series; it raises an
        # error when the Series does not hold exactly one element.
        t.write(state_data.state.item())


Here is a little lesson on `Series.item()`.

In [66]:
import pandas as pd
states_csv_path = "Garbage_data/50_states.csv"
data = pd.read_csv(states_csv_path)
# Series.item() only works on a Series holding exactly one element, so calling
# it on the whole column fails:
# data["state"].item() # produces an error: the result must be of size 1

## List and Dictionary Comprehension

We are going to be learning more about lists and dictionaries.

What is a list comprehension? It is a concise piece of Python syntax for creating a new list out of another list (or any other iterable) with less typing than a regular loop.

In [None]:
numbers = [1, 2, 3]
# Pattern: [new_item for item in iterable]; here every number is bumped by one.
new_numbers = [number + 1 for number in numbers]
new_numbers

[2, 3, 4]

In [69]:
name = "Angela"
# A comprehension can walk over a string as well, one character at a time.
letters_list = [char for char in name]
letters_list

['A', 'n', 'g', 'e', 'l', 'a']

In [70]:
# Double every value produced by range(1, 5), i.e. 1 through 4.
range_list = [2 * value for value in range(1, 5)]
range_list

[2, 4, 6, 8]

In [72]:
# range(...) produces its own "range" type, not a list.
one_to_four = range(1, 5)
type(one_to_four)

range

In [None]:
names = ["Alex", "Beth", "Caroline", "Dave", "Eleanor", "Freddie"]
# A trailing "if" filters the items: keep only names shorter than 5 characters.
short_names = [person for person in names if len(person) < 5]
short_names

['Alex', 'Beth', 'Dave']

In [75]:
# Transform and filter in one go: upper-case the names longer than 5 characters.
uppercase_long_names = [person.upper() for person in names if len(person) > 5]
uppercase_long_names

['CAROLINE', 'ELEANOR', 'FREDDIE']

Now, we are going to learn about dictionary comprehension.

In [86]:
import random
# Pattern: {new_key: new_value for item in iterable}. The source can be any
# iterable (list, string, tuple, ...); each name gets a random grade from 0 to 100.
student_grades = {pupil: random.randint(0, 100) for pupil in names}
student_grades

{'Alex': 53,
 'Beth': 80,
 'Caroline': 10,
 'Dave': 40,
 'Eleanor': 91,
 'Freddie': 13}

In [87]:
# Reminder: dict.items() yields the (key, value) pairs of the dictionary.
grade_pairs = student_grades.items()
print(grade_pairs)
print(type(grade_pairs))


dict_items([('Alex', 53), ('Beth', 80), ('Caroline', 10), ('Dave', 40), ('Eleanor', 91), ('Freddie', 13)])
<class 'dict_items'>


In [88]:
# Build a new dict from an existing one, keeping only the grades of 60 or more.
passed_students = {pupil: mark for pupil, mark in student_grades.items() if mark >= 60}
passed_students

{'Beth': 80, 'Eleanor': 91}

How to iterate over Pandas datatypes?

In [None]:
# A plain dict of column name -> list of values.
student_dict = {
    "students": ["Angela", "James", "Lily"],
    "scores": [56, 76, 98],
}

# Looping over a dict: only the values are printed here.
for column_values in student_dict.values():
    print(column_values)


import pandas as pd

student_df = pd.DataFrame(data=student_dict)

# DataFrame.items() walks the table column by column, yielding
# (column name, Series) pairs.
for column_name, column in student_df.items():
    print(column_name)
    print(column)


# DataFrame.iterrows() walks the table row by row, yielding
# (index label, Series) pairs.
for row_label, record in student_df.iterrows():
    print(row_label)
    print(record)


    # Each row is a Series keyed by column name, so fields can be read by name.
    print(record["students"])
    print(record["scores"])

['Angela', 'James', 'Lily']
[56, 76, 98]
students
0    Angela
1     James
2      Lily
Name: students, dtype: object
scores
0    56
1    76
2    98
Name: scores, dtype: int64
0
students    Angela
scores          56
Name: 0, dtype: object
<class 'pandas.core.series.Series'>
Angela
56
1
students    James
scores         76
Name: 1, dtype: object
<class 'pandas.core.series.Series'>
James
76
2
students    Lily
scores        98
Name: 2, dtype: object
<class 'pandas.core.series.Series'>
Lily
98


Now, let's do our project using NATO alphabet.