## 1. Create a pandas series from each of the items below: a list, numpy and a dictionary.

In [14]:
import pandas as pd
import numpy as np

# Create a pandas series from a list
mylist = list('abcedfghijklmnopqrstuvwxyz')
series_from_list = pd.Series(mylist)
print("Series from list:")
display(series_from_list)

# Create a pandas series from a numpy array
myarr = np.arange(26)
series_from_array = pd.Series(myarr)
print("\nSeries from numpy array:")
display(series_from_array)

# Create a pandas series from a dictionary
mydict = dict(zip(mylist, myarr))
series_from_dict = pd.Series(mydict)
print("\nSeries from dictionary:")
display(series_from_dict)

Series from list:


Unnamed: 0,0
0,a
1,b
2,c
3,e
4,d
5,f
6,g
7,h
8,i
9,j



Series from numpy array:


Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9



Series from dictionary:


Unnamed: 0,0
a,0
b,1
c,2
e,3
d,4
f,5
g,6
h,7
i,8
j,9


## 2. Convert the series ser into a dataframe with its index as another column on the dataframe.

In [15]:
ser = pd.Series(mydict)
df_from_series = ser.to_frame().reset_index()
df_from_series.columns = ['Index', 'Value'] # Renaming columns for clarity
print("\nDataFrame from series with index as a column:")
display(df_from_series)


DataFrame from series with index as a column:


Unnamed: 0,Index,Value
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


## 3. Combine ser1 and ser2 to form a dataframe.

In [16]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))
df_combined = pd.DataFrame({'col1': ser1, 'col2': ser2})
print("\nDataFrame from combining two series:")
display(df_combined)


DataFrame from combining two series:


Unnamed: 0,col1,col2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


## 4. Give a name to the series ser calling it ‘alphabets’.

In [17]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser.name = 'alphabets'
print("\nSeries with a name:")
display(ser)


Series with a name:


Unnamed: 0,alphabets
0,a
1,b
2,c
3,e
4,d
5,f
6,g
7,h
8,i
9,j


## 5. From ser1 remove items present in ser2.

In [18]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Items in ser1 that are not in ser2
items_to_keep = ~ser1.isin(ser2)
ser1_cleaned = ser1[items_to_keep]
print("Items in ser1 not present in ser2:")
display(ser1_cleaned)

Items in ser1 not present in ser2:


Unnamed: 0,0
0,1
1,2
2,3


## 6. Get all items of ser1 and ser2 not common to both.

In [19]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Items in ser1 not in ser2
uncommon_ser1 = ser1[~ser1.isin(ser2)]

# Items in ser2 not in ser1
uncommon_ser2 = ser2[~ser2.isin(ser1)]

# Concatenate the two series to get all uncommon items
uncommon_items_series = pd.concat([uncommon_ser1, uncommon_ser2])

print("Items not common to both ser1 and ser2:")
display(uncommon_items_series)

Items not common to both ser1 and ser2:


Unnamed: 0,0
0,1
1,2
2,3
2,6
3,7
4,8


## 7. Compute the minimum, 25th percentile, median, 75th, and maximum of ser.

In [20]:
ser = pd.Series(np.random.normal(10, 5, 25))

min_value = ser.min()
percentiles = ser.quantile([0.25, 0.5, 0.75])
median_value = percentiles[0.5]
max_value = ser.max()

print("Minimum:", min_value)
print("25th percentile:", percentiles[0.25])
print("Median:", median_value)
print("75th percentile:", percentiles[0.75])
print("Maximum:", max_value)

Minimum: 1.093239870720712
25th percentile: 7.233503655104858
Median: 11.006578892025631
75th percentile: 13.21380036954948
Maximum: 18.29018137451378


## 8. Calculate the frequency counts of each unique value ser.

In [21]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

value_counts = ser.value_counts()

print("Frequency counts of unique values:")
display(value_counts)

Frequency counts of unique values:


Unnamed: 0,count
c,6
h,6
b,5
d,4
f,3
a,3
e,2
g,1


## 9. From ser, keep the top 2 most frequent items as it is and replace everything else as ‘Other’.

In [22]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

# Get the top 2 most frequent items
top_2_items = ser.value_counts().nlargest(2).index.tolist()

# Replace items not in the top 2 with 'Other'
ser_replaced = ser.apply(lambda x: x if x in top_2_items else 'Other')

print("Series with top 2 most frequent items and 'Other' for others:")
display(ser_replaced)

Series with top 2 most frequent items and 'Other' for others:


Unnamed: 0,0
0,4
1,Other
2,2
3,2
4,4
5,4
6,Other
7,2
8,4
9,Other


## 10. Bin the series ser into 10 equal deciles and replace the values with the bin name.

In [23]:
ser = pd.Series(np.random.random(20))

# Bin the series into 10 equal deciles
ser_binned = pd.qcut(ser, q=10, labels=False, duplicates='drop')

# Replace with bin names (e.g., 'Decile_0', 'Decile_1', etc.)
ser_binned_named = ser_binned.apply(lambda x: f'Decile_{x}')

print("Series binned into 10 deciles with bin names:")
display(ser_binned_named)

Series binned into 10 deciles with bin names:


Unnamed: 0,0
0,Decile_1
1,Decile_3
2,Decile_0
3,Decile_5
4,Decile_8
5,Decile_7
6,Decile_3
7,Decile_4
8,Decile_6
9,Decile_9


## 11. Reshape the series ser into a dataframe with 7 rows and 5 columns.

In [24]:
ser = pd.Series(np.random.randint(1, 10, 35))

# Reshape the series into a DataFrame
df_reshaped = pd.DataFrame(ser.values.reshape(7, 5))

print("Reshaped Series into a DataFrame:")
display(df_reshaped)

Reshaped Series into a DataFrame:


Unnamed: 0,0,1,2,3,4
0,8,2,1,2,6
1,9,4,3,7,9
2,9,1,8,8,6
3,9,4,2,5,5
4,7,3,8,9,8
5,6,2,2,7,6
6,6,7,3,3,7


## 12. Find the positions of numbers that are multiples of 3 from ser.

In [25]:
ser = pd.Series(np.random.randint(1, 10, 7))

# Find the positions of numbers that are multiples of 3
positions = [i for i, x in enumerate(ser) if x % 3 == 0]

print("Positions of multiples of 3:")
print(positions)

Positions of multiples of 3:
[2, 3, 5]


## 13. From ser, extract the items at positions in list pos.

In [26]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# Extract items at specified positions
extracted_items = ser[pos]

print("Items extracted at specified positions:")
display(extracted_items)

Items extracted at specified positions:


Unnamed: 0,0
0,a
4,e
8,i
14,o
20,u


## 14. Stack ser1 and ser2 vertically and horizontally (to form a dataframe).

In [27]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Stack vertically
df_vertical = pd.concat([ser1, ser2], axis=0)
print("Stacked vertically:")
display(df_vertical)

# Stack horizontally
df_horizontal = pd.concat([ser1, ser2], axis=1)
print("\nStacked horizontally:")
display(df_horizontal)

Stacked vertically:


Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
0,a
1,b
2,c
3,d
4,e



Stacked horizontally:


Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


## 15. Get the positions of items of ser2 in ser1 as a list

In [28]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Get the positions of items of ser2 in ser1
positions_in_ser1 = [i for i, item_ser2 in enumerate(ser2) for j, item_ser1 in enumerate(ser1) if item_ser2 == item_ser1]

print("Positions of items from ser2 in ser1:")
print(positions_in_ser1)

Positions of items from ser2 in ser1:
[0, 1, 2, 3]


## 16. Compute the mean squared error of truth and pred series.

In [29]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

# Compute the mean squared error
mean_squared_error = np.mean((truth - pred)**2)

print("Mean Squared Error:", mean_squared_error)

Mean Squared Error: 0.46197218973804466


## 17. Change the first character of each word to upper case in each word of ser.

In [30]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Change the first character of each word to upper case
ser_capitalized = ser.str.capitalize()

print("Series with first character capitalized:")
display(ser_capitalized)

Series with first character capitalized:


Unnamed: 0,0
0,How
1,To
2,Kick
3,Ass?


## 18. How to calculate the number of characters in each word in a series?

In [31]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Calculate the number of characters in each word
char_counts = ser.str.len()

print("Number of characters in each word:")
display(char_counts)

Number of characters in each word:


Unnamed: 0,0
0,3
1,2
2,4
3,4


## 19. How to convert a series of date-strings to a timeseries?

In [34]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Convert to datetime series with mixed formats and error handling
timeseries = pd.to_datetime(ser, errors='coerce', format='mixed')

print("Converted to timeseries:")
display(timeseries)

Converted to timeseries:


Unnamed: 0,0
0,2010-01-01 00:00:00
1,2011-02-02 00:00:00
2,2012-03-03 00:00:00
3,2013-04-04 00:00:00
4,2014-05-05 00:00:00
5,2015-06-06 12:20:00


## 20. Get the day of month, week number, day of year and day of week from ser.

In [35]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Convert to datetime series first, handling mixed formats
timeseries = pd.to_datetime(ser, errors='coerce', format='mixed')

# Get day of month
day_of_month = timeseries.dt.day
print("\nDay of month:")
display(day_of_month)

# Get week number
week_number = timeseries.dt.isocalendar().week
print("\nWeek number:")
display(week_number)

# Get day of year
day_of_year = timeseries.dt.dayofyear
print("\nDay of year:")
display(day_of_year)

# Get day of week (Monday=0, Sunday=6)
day_of_week = timeseries.dt.dayofweek
print("\nDay of week:")
display(day_of_week)


Day of month:


Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6



Week number:


Unnamed: 0,week
0,53
1,5
2,9
3,14
4,19
5,23



Day of year:


Unnamed: 0,0
0,1
1,33
2,63
3,94
4,125
5,157



Day of week:


Unnamed: 0,0
0,4
1,2
2,5
3,3
4,0
5,5


## 21. Change ser to dates that start with 4th of the respective months.

In [36]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# Change to dates that start with the 4th of the respective months
dates_with_4th = pd.to_datetime(ser, format='%b %Y').apply(lambda x: x.replace(day=4))

print("Series with dates set to the 4th of the month:")
display(dates_with_4th)

Series with dates set to the 4th of the month:


Unnamed: 0,0
0,2010-01-04
1,2011-02-04
2,2012-03-04


## 22. From ser, extract words that contain atleast 2 vowels.

In [37]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Function to count vowels in a word
def count_vowels(word):
    vowels = 'aeiouAEIOU'
    return sum(1 for char in word if char in vowels)

# Extract words with at least 2 vowels
words_with_2_vowels = ser[ser.apply(count_vowels) >= 2]

print("Words with at least 2 vowels:")
display(words_with_2_vowels)

Words with at least 2 vowels:


Unnamed: 0,0
0,Apple
1,Orange
4,Money


## 23. Extract the valid emails from the series emails. The regex pattern for valid emails is provided as reference.

In [38]:
import re

emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Extract valid emails using the regex pattern
valid_emails = emails[emails.str.match(pattern)]

print("Valid emails:")
display(valid_emails)

Valid emails:


Unnamed: 0,0
1,rameses@egypt.com
2,matt@t.co
3,narendra@modi.com


## 24. Compute the mean of weights of each fruit.

In [39]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))

# Create a DataFrame from the two series
df_fruit_weights = pd.DataFrame({'fruit': fruit, 'weights': weights})

# Compute the mean weight for each fruit
mean_weights_by_fruit = df_fruit_weights.groupby('fruit')['weights'].mean()

print("Mean weights of each fruit:")
display(mean_weights_by_fruit)

Mean weights of each fruit:


Unnamed: 0_level_0,weights
fruit,Unnamed: 1_level_1
apple,5.0
banana,5.5
carrot,6.0


## 25. Compute the euclidean distance between series (points) p and q.

In [40]:
from scipy.spatial.distance import euclidean

p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# Compute the euclidean distance
euclidean_dist = euclidean(p, q)

print("Euclidean distance between p and q:", euclidean_dist)

Euclidean distance between p and q: 18.16590212458495


## 26. Get the positions of peaks (values surrounded by smaller values on both sides) in ser.

In [41]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

peaks = [i for i in range(1, len(ser) - 1) if ser[i] > ser[i-1] and ser[i] > ser[i+1]]

print("Positions of peaks:")
print(peaks)

Positions of peaks:
[1, 5, 7]
