# Pandas & Polars: Exercise Results

## 1. Create a DataFrame
- Create a simple DataFrame using both Pandas and Polars. This helps you understand the syntax differences and similarities between the two libraries for creating tabular data.


In [None]:
# Pandas: build a small DataFrame from a dict mapping column names to lists.
# The import lives in this first cell because nothing earlier in the notebook
# brings `pd` into scope.
import pandas as pd

df_pd = pd.DataFrame({'name': ['Alice', 'Bob', 'Charlie'], 'score': [85, 92, 78]})
print(df_pd)


In [None]:

# Polars: build the same DataFrame from a dict mapping column names to lists.
# The import lives in this cell because nothing earlier in the notebook
# brings `pl` into scope.
import polars as pl

df_pl = pl.DataFrame({'name': ['Alice', 'Bob', 'Charlie'], 'score': [85, 92, 78]})
print(df_pl)


## 2. Read CSV
- Practice reading data from a CSV file using both Pandas and Polars. This is a fundamental skill for importing real-world datasets.


In [None]:
# Pandas: load the student records from 'students.csv' into a DataFrame.
students_pd = pd.read_csv(filepath_or_buffer='students.csv')


In [None]:

# Polars: load the same 'students.csv' file into a Polars DataFrame.
# The path is passed positionally, so the call does not depend on the
# parameter's keyword name.
students_pl = pl.read_csv('students.csv')



## 3. Inspect Data
- Explore how to view the first few rows, check the column names, and get a quick overview of your data. This step is crucial for understanding the structure and contents of your dataset.


In [None]:

# Show the first three rows and the column names of each frame,
# Pandas first and then Polars.
for frame in (students_pd, students_pl):
    print(frame.head(3))
    print(frame.columns)

## 4. Filter Data
- Apply filters to your DataFrame to select rows based on conditions (e.g., scores above a certain threshold). Filtering is essential for focusing on relevant data.


In [None]:

# Pandas: select the rows whose score is strictly greater than 80,
# using a boolean mask with label-based indexing.
print(students_pd.loc[students_pd['score'].gt(80)])


In [None]:

# Polars: select the rows whose score is strictly greater than 80,
# with the predicate written as a column expression.
print(students_pl.filter(pl.col('score') > 80))


## 5. Add a Column
- Add a new column to your DataFrame based on existing data (e.g., a 'passed' column that is True if score >= 60). This exercise demonstrates how to derive new information from your data.


In [None]:

# Pandas: add a boolean 'passed' column that is True where score is at least 60.
students_pd['passed'] = students_pd['score'].ge(60)


In [None]:

# Polars: add a boolean 'passed' column (score at least 60). DataFrames are not
# modified in place, so the result of with_columns is bound back to the name.
students_pl = students_pl.with_columns((pl.col('score') >= 60).alias('passed'))


## 6. Group and Aggregate
- Group your data by a column and calculate aggregate statistics (like count, mean, or sum). Grouping and aggregation are key operations for summarizing and analyzing data.


In [None]:

# Pandas: count how many rows fall into each value of 'passed'.
print(students_pd.groupby(by='passed').size())


In [None]:

# Polars: count how many rows fall into each value of 'passed'.
# `group_by` is the current spelling (the old `groupby` alias was removed), and
# `len()` replaces the deprecated `GroupBy.count()`; the count column is
# therefore named 'len'.
print(students_pl.group_by('passed').len())


## 7. Handle Missing Data
- Learn how to identify and handle missing values in your data. This is important for data cleaning and ensuring analysis accuracy.


In [None]:

# Pandas: compute the mean score, then use it to fill the missing scores.
mean_score_pd = students_pd['score'].mean()
students_pd['score'] = students_pd['score'].fillna(value=mean_score_pd)


In [None]:

# Polars: compute the mean score, then use it to fill the null scores.
mean_score_pl = students_pl.get_column('score').mean()
students_pl = students_pl.with_columns(pl.col('score').fill_null(mean_score_pl))




## 8. Merge DataFrames
- Combine two DataFrames using a common column. Merging is vital for bringing together related datasets.


In [None]:

# Pandas: build an email lookup table and merge it onto the student records
# via the shared 'name' column.
emails_pd = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com'],
})
merged_pd = students_pd.merge(emails_pd, on='name')
print(merged_pd)


In [None]:

# Polars: build an email lookup table and join it onto the student records
# via the shared 'name' column (inner join, which is the default strategy).
emails_pl = pl.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com'],
})
merged_pl = students_pl.join(emails_pl, on='name', how='inner')
print(merged_pl)

## 9. Write to CSV
- Export your processed DataFrame to a new CSV file. Writing data is necessary for saving results and sharing them with others.

In [None]:

# Pandas: save the processed frame to 'output.csv', leaving out the row index.
students_pd.to_csv(path_or_buf='output.csv', index=False)


In [None]:

# Polars: save the processed frame to 'output.csv'.
# NOTE(review): this is the same path the Pandas cell writes to, so running
# both cells leaves only this Polars output on disk.
students_pl.write_csv('output.csv')



---

### Challenge
- Apply what you’ve learned to a larger or more complex dataset, or try optimizing your code for performance using Polars.


In [None]:

# Pandas: time building a 1,000,000-row frame and adding a derived column.
import time
import numpy as np

rows = 1_000_000
# Two columns of random integers in [0, 100); `data` is reused by the Polars cell.
data = {'a': np.random.randint(0, 100, rows), 'b': np.random.randint(0, 100, rows)}

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
pd_df = pd.DataFrame(data)
pd_df['c'] = pd_df['a'] + pd_df['b']
pandas_time = time.perf_counter() - start
print(f'Pandas time: {pandas_time:.2f}s')


In [None]:

# Polars: time the same workload on the `data` dict built in the Pandas cell.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
pl_df = pl.DataFrame(data)
pl_df = pl_df.with_columns((pl.col('a') + pl.col('b')).alias('c'))
polars_time = time.perf_counter() - start
print(f'Polars time: {polars_time:.2f}s')




Note: Polars is typically much faster than Pandas for large datasets.
