# Pandas

* Pandas is a Python library used for working with data sets.
* It has functions for analyzing, cleaning, exploring, and manipulating data.
* The name "Pandas" refers to both "Panel Data" and "Python Data Analysis"; the library was created by Wes McKinney in 2008.

In [None]:
import pandas as pd

# Column-oriented input: every keyword becomes a column name and its list
# supplies that column's values, one entry per row.
mydataset = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

# Turn the mapping into a table (rows get the default labels 0, 1, 2) and show it.
mydf = pd.DataFrame(data=mydataset)
print(mydf)

In [None]:
# select a single column by its name; the result is shown as the cell output
mydf["passings"]

A Pandas Series is like a column in a table.

It is a one-dimensional array holding data of any type.

In [None]:
# Wrap a plain Python list in a Series; no index is given, so the values
# are labelled 0, 1, 2.
a = [1, 7, 2]
myvar = pd.Series(data=a)
print(myvar)


In [None]:
# access the first element (label 0 of the default integer index)
print(myvar[0])

In [None]:
# Same values, but each one gets an explicit label ("x", "y", "z")
# instead of the default 0, 1, 2.
a = [1, 7, 2]
myvar = pd.Series(data=a, index=list("xyz"))
print(myvar)

In [None]:
# access the element by its index label ("y" is one of the labels set above)
print(myvar["y"])

In [None]:
# Build a Series from a dictionary: the keys become the index labels and
# the values become the data.
calories = dict(day1=420, day2=380, day3=390)
myvar = pd.Series(data=calories)
print(myvar)


In [None]:
# Passing index= together with a dictionary keeps only the listed keys,
# so this Series holds just the "day1" and "day2" entries.
calories = dict(day1=420, day2=380, day3=390)
myvar = pd.Series(data=calories, index=["day1", "day2"])
print(myvar)


In [None]:
# Two columns of three rows each; without an explicit index the rows are
# labelled 0, 1, 2.
data = dict(calories=[420, 380, 390], duration=[50, 40, 45])
df = pd.DataFrame(data=data)
# alternative with named row labels:
# df = pd.DataFrame(data, index=["day1", "day2", "day3"])
print(df)


In [None]:
# loc selects rows by index label; df was built without an explicit index,
# so its labels are the default integers and 0 picks the first row
print(df.loc[0])
# with the named index from the commented-out constructor above, use instead:
# print(df.loc["day1"])

In [None]:
# to include multiple rows, pass a list of labels (the result is a DataFrame)
print(df.loc[[0, 1]])
# equivalent for a named index:
# print(df.loc[["day1", "day2"]])

In [None]:
# iloc selects by integer position rather than by label, so it can be used
# even though the rows now carry the named labels "day1".."day3";
# 0:2 takes positions 0 and 1 (the slice end is exclusive)
df = pd.DataFrame(data, index=["day1", "day2", "day3"])
print(df.iloc[0:2])

In [None]:
import numpy as np


# Compare label-list selection (loc) with positional slicing (iloc) on a
# frame whose columns all hold integers, and report for each result whether
# it shares memory with the original frame.
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
df = pd.DataFrame(data)

selected_df1 = df.loc[[0, 1]]  # rows chosen by a list of labels
selected_df2 = df.iloc[0:2]  # rows chosen by a positional slice

print(
    f"The df and selected_df1 share memory using 'loc' method: {np.shares_memory(selected_df1, df)}"
)
print(
    f"The df and selected_df2 share memory using 'iloc' method: {np.shares_memory(selected_df2, df)}"
)
# NOTE(review): `_is_view` is a private pandas attribute (leading underscore);
# confirm it is available in the pandas version this notebook targets.
print(f"The selected_df1 is a view: {selected_df1._is_view}")
print(f"The selected_df2 is a view: {selected_df2._is_view}")

In [None]:
# this behaviour is not guaranteed for mixed data types
# (here "calories" holds integers while "duration" holds strings)
data = {
    "calories": [420, 380, 390],
    "duration": ["50", "40", "45"],
}

df = pd.DataFrame(data)

selected_df1 = df.loc[[0, 1]]  # rows chosen by a list of labels
selected_df2 = df.iloc[0:2]  # rows chosen by a positional slice

print(
    f"The df and selected_df1 share memory using 'loc' method: {np.shares_memory(selected_df1, df)}"
)
print(
    f"The df and selected_df2 share memory using 'iloc' method: {np.shares_memory(selected_df2, df)}"
)
# NOTE(review): `_is_view` is a private pandas attribute (leading underscore);
# confirm it is available in the pandas version this notebook targets.
print(f"The selected_df1 is a view: {selected_df1._is_view}")
print(f"The selected_df2 is a view: {selected_df2._is_view}")


In [None]:
# sample data with nan values
# ("sample_nan.csv" is a relative path, so it is looked up in the current
# working directory)
df = pd.read_csv("sample_nan.csv")
df

In [None]:
# check for missing values: isnull() gives a table of the same shape with
# True wherever a value is missing
print(df.isnull())

In [None]:
# count the missing values in each column: summing the boolean table down
# the rows (axis=0) counts every True as 1
df.isnull().sum(axis=0)

In [None]:
# boolean mask of the missing values in the "point" column only
print(df["point"].isnull())

In [None]:
# number of missing values in the "point" column (each True counts as 1)
df["point"].isnull().sum()

In [None]:
# dropna() returns a new DataFrame without the rows that contain missing
# values; the result is kept in new_df
new_df = df.dropna()
new_df

In [None]:
# show df for comparison with new_df (dropna() above was called without
# inplace=True and its result was assigned to new_df)
df

In [None]:
# inplace=True will change the original dataframe
# (the result is not assigned to a new name; df itself is displayed next)
df.dropna(inplace=True)
df

In [None]:
# reload the CSV so that df starts again from the file's contents,
# undoing the in-place dropna() above
df = pd.read_csv("sample_nan.csv")
df


In [None]:
# fill all missing values with 0 (inplace=True modifies df directly)
df.fillna(0, inplace=True)
df

In [None]:
# fill missing values specifically for each column
# (the dict maps a column name to the replacement value for that column)
df = pd.read_csv("sample_nan.csv")
df.fillna({"name": "No Name", "point": 0.0, "age": 30.0, "state": "DC"}, inplace=True)
# take an independent copy of the four listed columns
newdf = df[["name", "point", "age", "state"]].copy()
newdf

In [None]:
# load data from url (read_csv is given the URL string directly, so this
# cell needs network access)
url = "https://github.com/YBI-Foundation/Dataset/raw/refs/heads/main/Diabetes%20Missing%20Data.csv"
df = pd.read_csv(url)
df

In [None]:
# print a summary of the frame: columns, non-null counts and dtypes
df.info()

In [None]:
# boolean table marking which values are missing
df.isnull()


In [None]:
# number of missing values in each column
df.isnull().sum(axis=0)

In [None]:
# preview the frame with incomplete rows removed; the result is not
# assigned, so df still holds its missing values for the cells below
df.dropna()

In [None]:
# fill missing values with mean
# (compute the column mean first, then fill only "Serum_Insulin" by
# passing a {column: value} dict)
SI_mean = df["Serum_Insulin"].mean()
df.fillna({"Serum_Insulin": SI_mean}, inplace=True)
df.info()

In [None]:
# fill missing values with median
# (compute the column median first, then fill only "Skin_Fold")
SF_median = df["Skin_Fold"].median()
df.fillna({"Skin_Fold": SF_median}, inplace=True)
df.info()

In [None]:
# fill missing values with mode
# mode is the value that appears most frequently
# mode() returns a Series because several values can tie; [0] takes the
# first of them
BMI_mode = df["BMI"].mode()[0]
print(BMI_mode)
df.fillna({"BMI": BMI_mode}, inplace=True)
df.info()

In [None]:
# show the frame after the fills above
df

In [None]:
# you can see all mode values: DataFrame.mode() computes the mode(s) of
# every column at once
df.mode()

In [None]:
# summary of the "Diabetes_Pedigree" values that are greater than 1
df["Diabetes_Pedigree"].loc[df["Diabetes_Pedigree"] > 1].info()


In [None]:
# replace all values in "Diabetes_Pedigree" column that are greater than 1 with 1
# (the boolean mask selects the rows, the second argument selects the column)
df.loc[df["Diabetes_Pedigree"] > 1, "Diabetes_Pedigree"] = 1
df

In [None]:
# duplicate a row: append a copy of the first row (iloc[0:1] keeps it as a
# one-row DataFrame) and renumber the index with ignore_index=True
newdf = pd.concat([df, df.iloc[0:1]], ignore_index=True)
newdf

In [None]:
# to check for duplicates
# duplicated() returns a boolean Series with True for each row that repeats
# an earlier row (it does not return the duplicate rows themselves)
newdf.duplicated()
# to count the duplicates instead:
# np.sum(newdf.duplicated())

In [None]:
# load data from url (needs network access), then print a summary of the
# columns, non-null counts and dtypes
url = "https://github.com/Opensourcefordatascience/Data-sets/raw/refs/heads/master/automotive_data.csv"
df = pd.read_csv(url)
df.info()

In [None]:
# number of rows that repeat an earlier row (each True counts as 1)
df.duplicated().sum()


In [None]:
# number of unique values in each column (one count per column)
df.nunique()

In [None]:
# unique values in a column (here "price"); useful for spotting
# placeholder entries such as the "?" handled below
df["price"].unique()

In [None]:
# replace every "?" placeholder with NaN so pandas treats it as missing,
# then drop the rows that contain a missing value (both steps modify df
# in place)
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)
df.info()

In [None]:
# show the frame after the "?" rows were removed
df

In [None]:
# Convert the problematic columns to numeric dtypes, one column at a time
# (presumably they were parsed as text because of the "?" placeholders),
# then print the summary to confirm the new dtypes.
for column in ("price", "horsepower", "peak-rpm"):
    df[column] = pd.to_numeric(df[column])
df.info()

In [None]:
# if you need to discard non-numeric columns, use the public
# select_dtypes() API: "number" covers the integer and float columns and
# "bool" keeps boolean columns as well
newdf = df.select_dtypes(include=["number", "bool"])
newdf.info()


In [None]:
# pairwise correlation between the numeric columns, rendered as a table
# whose cell backgrounds follow the "coolwarm" colour map
# NOTE(review): Styler.background_gradient relies on matplotlib — confirm it
# is installed in the notebook environment
newdf.corr().style.background_gradient(cmap="coolwarm")

The result of the `corr()` method is a table of numbers that represent how strong the relationship is between each pair of columns.

* The number varies from $-1$ to $1$.

* $1$ means that there is a 1 to 1 relationship (a perfect correlation), and for this data set, each time a value went up in the first column, the other one went up as well.

* $0.9$ is also a good relationship, and if you increase one value, the other will probably increase as well.

* $-0.9$ would be just as good a relationship as $0.9$, but if you increase one value, the other will probably go down.

* $0.2$ means NOT a good relationship: if one value goes up, it does not mean that the other will.