<a href="https://colab.research.google.com/github/abhilashmarathe/data_science_30days/blob/main/Day2_AdvancedPython_DataLoading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Part 1: File Handling & Modules**

In [21]:
%%writefile utils.py

def read_text(file_path, encoding="utf-8"):
    """Read a text file and return its lines as a list of strings.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one string per line. Line endings are kept, exactly as
        ``file.readlines()`` returns them.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        UnicodeDecodeError: If the content cannot be decoded with ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.readlines()

def write_text(file_path, lines_list, encoding="utf-8"):
    """Write a list of strings to a text file, replacing any existing content.

    Args:
        file_path: Path of the file to create or overwrite.
        lines_list: Iterable of strings to write. No line separators are
            added, so each item must already carry its own newline if needed.
        encoding: Text encoding used to encode the file. Defaults to UTF-8 so
            the bytes written do not depend on the platform's locale encoding.

    Raises:
        TypeError: If an item of ``lines_list`` is not a string.
    """
    with open(file_path, 'w', encoding=encoding) as f:
        # writelines emits the items back to back, like the explicit loop did.
        f.writelines(lines_list)

Writing utils.py


In [22]:
from utils import read_text, write_text

# Create a small sample file to round-trip through the helpers.
with open("demo.txt", "w") as f:
    f.write("hello\nworld\npython")

# Read the sample and build the upper-cased version of every line.
lines = read_text("demo.txt")
lines_upper = [line.upper() for line in lines]

# Write the modified content to a separate file so the input stays intact.
write_text("demo_upper.txt", lines_upper)

# Check against what actually landed on disk. Comparing `lines` with
# `lines_upper` is always true, because one list is derived from the other.
written_lines = read_text("demo_upper.txt")
assert len(lines) == len(written_lines), "Line count mismatch!"
print("✅ File handling works perfectly!")


✅ File handling works perfectly!


In [1]:
# Create a Python module file named utils.py and define functions: read_text(file_path) reads a .txt file and returns list of lines

def read_text(file_path, encoding="utf-8"):
    """Read a text file and return its lines as a list of strings.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one string per line, line endings included.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        UnicodeDecodeError: If the content cannot be decoded with ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.readlines()
    return lines

In [5]:
# read_text(file_path) → reads a .txt file and returns list of lines.

file_path = 'example.txt'

# No earlier cell creates the sample file, so make it here when it is missing;
# otherwise a fresh-kernel run stops with FileNotFoundError.
# Mode 'x' refuses to overwrite, which keeps any file the reader already has.
try:
    with open(file_path, 'x') as sample:
        sample.write('You Are Amazing!')
except FileExistsError:
    pass  # the sample already exists; read it as-is

lines = read_text(file_path)
print(lines)

['You Are Amazing!']


In [8]:
# write_text(file_path, lines_list) → writes list to .txt.

def write_text(file_path, lines_list, encoding="utf-8"):
    """Write a list of strings to a text file, replacing any existing content.

    Args:
        file_path: Path of the file to create or overwrite.
        lines_list: Iterable of strings to write. No line separators are
            added, so each item must already carry its own newline if needed.
        encoding: Text encoding used to encode the file. Defaults to UTF-8 so
            the bytes written do not depend on the platform's locale encoding.

    Raises:
        TypeError: If an item of ``lines_list`` is not a string.
    """
    with open(file_path, 'w', encoding=encoding) as file:
        file.writelines(lines_list)

In [10]:
# Use read_text to read a small sample .txt file of your choice (create it), then use write_text to write a modified version (e.g., uppercase all lines).

# Round-trip the sample file: read it, upper-case each line, save the result
# under a new name so the original stays untouched.
file_path = 'example.txt'
lines = read_text(file_path)
modified_lines = list(map(str.upper, lines))
write_text('modified_example.txt', modified_lines)

print(modified_lines)

['YOU ARE AMAZING!']


In [11]:
# Use assert to make sure the number of lines in input = number of lines in output.

# Compare the line counts of the original file and its upper-cased copy.
input_file_path = 'example.txt'
output_file_path = 'modified_example.txt'

input_lines = read_text(input_file_path)
output_lines = read_text(output_file_path)

# Show both counts first so a failing assertion is easy to diagnose.
n_input, n_output = len(input_lines), len(output_lines)
print(n_input)
print(n_output)
assert n_input == n_output

1
1


# **Part 2: Pandas Data Loading & Operations**

In [12]:
# Download a small CSV dataset (for example: Iris dataset or any dataset < 10k rows)

import pandas as pd

# Location of the Iris data file on the UCI archive.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# header=None treats the first record as data, so the columns are labelled
# with integers (0, 1, 2, ...) instead of names.
df = pd.read_csv(filepath_or_buffer=url, header=None)
df

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [15]:
# Display the first 10 rows, shape, columns, basic statistics (.describe())

# `df` was already loaded from `url` by the preceding cell; reuse it here
# rather than downloading the same file a second time.
print(df.head(10))    # first 10 rows
print(df.shape)       # (number of rows, number of columns)
print(df.columns)     # column labels
print(df.describe())  # summary statistics of the numeric columns

     0    1    2    3            4
0  5.1  3.5  1.4  0.2  Iris-setosa
1  4.9  3.0  1.4  0.2  Iris-setosa
2  4.7  3.2  1.3  0.2  Iris-setosa
3  4.6  3.1  1.5  0.2  Iris-setosa
4  5.0  3.6  1.4  0.2  Iris-setosa
5  5.4  3.9  1.7  0.4  Iris-setosa
6  4.6  3.4  1.4  0.3  Iris-setosa
7  5.0  3.4  1.5  0.2  Iris-setosa
8  4.4  2.9  1.4  0.2  Iris-setosa
9  4.9  3.1  1.5  0.1  Iris-setosa
(150, 5)
Index([0, 1, 2, 3, 4], dtype='int64')
                0           1           2           3
count  150.000000  150.000000  150.000000  150.000000
mean     5.843333    3.054000    3.758667    1.198667
std      0.828066    0.433594    1.764420    0.763161
min      4.300000    2.000000    1.000000    0.100000
25%      5.100000    2.800000    1.600000    0.300000
50%      5.800000    3.000000    4.350000    1.300000
75%      6.400000    3.300000    5.100000    1.800000
max      7.900000    4.400000    6.900000    2.500000


In [17]:
# Select a subset of columns, filter rows by a condition, sort by a column

# Column subset: keep only the first three columns.
subset_df = df.loc[:, [0, 1, 2]]
print(subset_df)

# Row filter: keep rows whose column 0 value is greater than 5.
filtered_df = df.loc[df[0] > 5]
print(filtered_df)

# Sort: order all rows by column 0, ascending.
sorted_df = df.sort_values(0)
print(sorted_df)

       0    1    2
0    5.1  3.5  1.4
1    4.9  3.0  1.4
2    4.7  3.2  1.3
3    4.6  3.1  1.5
4    5.0  3.6  1.4
..   ...  ...  ...
145  6.7  3.0  5.2
146  6.3  2.5  5.0
147  6.5  3.0  5.2
148  6.2  3.4  5.4
149  5.9  3.0  5.1

[150 rows x 3 columns]
       0    1    2    3               4
0    5.1  3.5  1.4  0.2     Iris-setosa
5    5.4  3.9  1.7  0.4     Iris-setosa
10   5.4  3.7  1.5  0.2     Iris-setosa
14   5.8  4.0  1.2  0.2     Iris-setosa
15   5.7  4.4  1.5  0.4     Iris-setosa
..   ...  ...  ...  ...             ...
145  6.7  3.0  5.2  2.3  Iris-virginica
146  6.3  2.5  5.0  1.9  Iris-virginica
147  6.5  3.0  5.2  2.0  Iris-virginica
148  6.2  3.4  5.4  2.3  Iris-virginica
149  5.9  3.0  5.1  1.8  Iris-virginica

[118 rows x 5 columns]
       0    1    2    3               4
13   4.3  3.0  1.1  0.1     Iris-setosa
8    4.4  2.9  1.4  0.2     Iris-setosa
42   4.4  3.2  1.3  0.2     Iris-setosa
38   4.4  3.0  1.3  0.2     Iris-setosa
41   4.5  2.3  1.3  0.3     Iris-setosa
..  

In [18]:
# Write the DataFrame back to a new CSV using df.to_csv("filtered.csv", index=False)

# Select the rows whose column 0 value is greater than 5 ...
above_five = df[0] > 5
df2 = df.loc[above_five]
# ... and save them without the index column.
df2.to_csv("filtered.csv", index=False)

# **Part 3: Comprehensions & Advanced Structures**

In [19]:
# From the DataFrame you loaded, create a list comprehension that extracts a list of dictionaries: each dict = {column1: value1, column2: value2} for first 50 rows

# Work on the first 50 rows only.
df2 = df.head(50)
# One dict per row, mapping each column label to that row's value.
dict_list = [
    dict(row.items())
    for _, row in df2.iterrows()
]
print(dict_list)

[{0: 5.1, 1: 3.5, 2: 1.4, 3: 0.2, 4: 'Iris-setosa'}, {0: 4.9, 1: 3.0, 2: 1.4, 3: 0.2, 4: 'Iris-setosa'}, {0: 4.7, 1: 3.2, 2: 1.3, 3: 0.2, 4: 'Iris-setosa'}, {0: 4.6, 1: 3.1, 2: 1.5, 3: 0.2, 4: 'Iris-setosa'}, {0: 5.0, 1: 3.6, 2: 1.4, 3: 0.2, 4: 'Iris-setosa'}, {0: 5.4, 1: 3.9, 2: 1.7, 3: 0.4, 4: 'Iris-setosa'}, {0: 4.6, 1: 3.4, 2: 1.4, 3: 0.3, 4: 'Iris-setosa'}, {0: 5.0, 1: 3.4, 2: 1.5, 3: 0.2, 4: 'Iris-setosa'}, {0: 4.4, 1: 2.9, 2: 1.4, 3: 0.2, 4: 'Iris-setosa'}, {0: 4.9, 1: 3.1, 2: 1.5, 3: 0.1, 4: 'Iris-setosa'}, {0: 5.4, 1: 3.7, 2: 1.5, 3: 0.2, 4: 'Iris-setosa'}, {0: 4.8, 1: 3.4, 2: 1.6, 3: 0.2, 4: 'Iris-setosa'}, {0: 4.8, 1: 3.0, 2: 1.4, 3: 0.1, 4: 'Iris-setosa'}, {0: 4.3, 1: 3.0, 2: 1.1, 3: 0.1, 4: 'Iris-setosa'}, {0: 5.8, 1: 4.0, 2: 1.2, 3: 0.2, 4: 'Iris-setosa'}, {0: 5.7, 1: 4.4, 2: 1.5, 3: 0.4, 4: 'Iris-setosa'}, {0: 5.4, 1: 3.9, 2: 1.3, 3: 0.4, 4: 'Iris-setosa'}, {0: 5.1, 1: 3.5, 2: 1.4, 3: 0.3, 4: 'Iris-setosa'}, {0: 5.7, 1: 3.8, 2: 1.7, 3: 0.3, 4: 'Iris-setosa'}, {0: 5.1, 1:

In [20]:
# Create a dictionary comprehension: keys = unique values from one column, values = count of each unique value

# Distinct values found in column 0.
unique_values = df[0].unique()
# Map each distinct value to the number of rows that hold it: the boolean
# mask is True on matching rows, so its sum is the row count.
value_counts = {
    value: int((df[0] == value).sum())
    for value in unique_values
}
print(value_counts)

{np.float64(5.1): 9, np.float64(4.9): 6, np.float64(4.7): 2, np.float64(4.6): 4, np.float64(5.0): 10, np.float64(5.4): 6, np.float64(4.4): 3, np.float64(4.8): 5, np.float64(4.3): 1, np.float64(5.8): 7, np.float64(5.7): 8, np.float64(5.2): 4, np.float64(5.5): 7, np.float64(4.5): 1, np.float64(5.3): 1, np.float64(7.0): 1, np.float64(6.4): 7, np.float64(6.9): 4, np.float64(6.5): 5, np.float64(6.3): 9, np.float64(6.6): 2, np.float64(5.9): 3, np.float64(6.0): 6, np.float64(6.1): 6, np.float64(5.6): 6, np.float64(6.7): 8, np.float64(6.2): 4, np.float64(6.8): 3, np.float64(7.1): 1, np.float64(7.6): 1, np.float64(7.3): 1, np.float64(7.2): 3, np.float64(7.7): 4, np.float64(7.4): 1, np.float64(7.9): 1}
