In [1]:
import pandas as pd
import numpy as np
import timeit

pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.width')

def print_series(sr: pd.Series):
    print(sr.to_string(max_rows=10, dtype=True))
    print()

def test_solutions(*solutions, df):
    for idx, solution in enumerate(solutions):
        def timed_solution():
            solution(df.copy())
            
        time = timeit.timeit(timed_solution, number=1)
        print(f"Solution {idx + 1} Time: {time:.6f} seconds")

##### 1. Write a Pandas program to convert all the string values to upper, lower cases in a given pandas series. Also find the length of the string values.

In [12]:
sr = pd.Series(['Java', 'Scala', 'Python'])
print("Original series:")
print(sr)

print('\nUPPER case:')
print(sr.str.upper())

print('\nlower case:')
print(sr.str.lower())

print('\nLength of strings')
print(sr.str.len())

Original series:
0      Java
1     Scala
2    Python
dtype: object

UPPER case:
0      JAVA
1     SCALA
2    PYTHON
dtype: object

lower case:
0      java
1     scala
2    python
dtype: object

Length of strings
0    4
1    5
2    6
dtype: int64


##### 2. Write a Pandas program to remove whitespaces, left sided whitespaces and right sided whitespaces of the string values of a given pandas series.

In [26]:
sr = pd.Series([' Java  ', '   Scala', 'Py   thon   '])
print("Original series:")
print(sr)

print(sr.str.len())

print("\nRemove whitespaces:")
result = sr.str.replace(' ', '')
print(result)
print(result.str.len())
print()
result = sr.str.strip()
print(result)
print(result.str.len())


print("\nRemove left sided whitespaces:")
result = sr.str.lstrip()
print(result)
print(result.str.len())

print("\nRemove right sided whitespaces:")
result = sr.str.rstrip()
print(result)
print(result.str.len())

Original series:
0          Java  
1           Scala
2    Py   thon   
dtype: object
0     7
1     8
2    12
dtype: int64

Remove whitespaces:
0      Java
1     Scala
2    Python
dtype: object
0    4
1    5
2    6
dtype: int64

0         Java
1        Scala
2    Py   thon
dtype: object
0    4
1    5
2    9
dtype: int64

Remove left sided whitespaces:
0          Java  
1           Scala
2    Py   thon   
dtype: object
0     6
1     5
2    12
dtype: int64

Remove right sided whitespaces:
0         Java
1        Scala
2    Py   thon
dtype: object
0    5
1    8
2    9
dtype: int64


In [28]:
color1 = pd.Index([' Green', 'Black ', ' Red ', 'White', ' Pink '])

print("Original series:")
print(color1)

print("\nRemove whitespace")
print(color1.str.strip())

print("\nRemove left sided whitespace")
print(color1.str.lstrip())

print("\nRemove Right sided whitespace")
print(color1.str.rstrip())

Original series:
Index([' Green', 'Black ', ' Red ', 'White', ' Pink '], dtype='object')

Remove whitespace
Index(['Green', 'Black', 'Red', 'White', 'Pink'], dtype='object')

Remove left sided whitespace
Index(['Green', 'Black ', 'Red ', 'White', 'Pink '], dtype='object')

Remove Right sided whitespace
Index([' Green', 'Black', ' Red', 'White', ' Pink'], dtype='object')


##### 3. Write a Pandas program to add leading zeros to the integer column in a pandas series and makes the length of the field to 8 digit.

In [35]:
sr = pd.Series(np.arange(1, 11))
sr.astype(str).str.zfill(8)

0    00000001
1    00000002
2    00000003
3    00000004
4    00000005
5    00000006
6    00000007
7    00000008
8    00000009
9    00000010
dtype: object

In [47]:
nums = {'amount': [10, 250, 3000, 40000, 500000]}

print("Original dataframe:")
df = pd.DataFrame(nums)
print(df)

print("\nAdd leading zeros:")
df['amount'] = df['amount'].apply(lambda x: f'{x:0>8}')
print(df)

Original dataframe:
   amount
0      10
1     250
2    3000
3   40000
4  500000

Add leading zeros:
     amount
0  00000010
1  00000250
2  00003000
3  00040000
4  00500000


In [36]:
nums = {'amount': [10, 250, 3000, 40000, 500000]}

print("Original dataframe:")
df = pd.DataFrame(nums)
print(df)

print("\nAdd leading zeros:")
df['amount'] = df['amount'].astype(str).str.zfill(8)
print(df)

Original dataframe:
   amount
0      10
1     250
2    3000
3   40000
4  500000

Add leading zeros:
     amount
0  00000010
1  00000250
2  00003000
3  00040000
4  00500000


In [104]:
def solution_1():
    return df['amount'].astype(str).str.zfill(8)

def solution_2():
    return df['amount'].apply(lambda x: '{0:0>8}'.format(x))

nums = {'amount': [10, 250, 3000, 40000, 500000]}

print("Original dataframe:")
df = pd.DataFrame(nums)
print(df)

test_solutions(solution_1, solution_2)

Original dataframe:
   amount
0      10
1     250
2    3000
3   40000
4  500000
Solution 1 Time: 0.031274 seconds
Solution 2 Time: 0.012792 seconds


##### 4. Write a Pandas program to add leading zeros to the character column in a pandas series and makes the length of the field to 8 digit.

In [55]:
df = pd.DataFrame({'char': ['a', 'b', 'c', 'd', 'e']})
df['char'] = df['char'].apply(lambda ch: f'{ch:0>8}')
print(df)

       char
0  0000000a
1  0000000b
2  0000000c
3  0000000d
4  0000000e


In [60]:
df = pd.DataFrame({'char': ['a', 'b', 'c', 'd', 'e']})
df['char'] = df['char'].str.zfill(8)
print(df)

       char
0  0000000a
1  0000000b
2  0000000c
3  0000000d
4  0000000e


In [54]:
nums = {'amount': ['10', '250', '3000', '40000', '500000']}
print("Original dataframe:")
df = pd.DataFrame(nums)
print(df)
print("\nAdd leading zeros:")
df['amount'] = list(map(lambda x: x.zfill(10), df['amount']))
print(df)

Original dataframe:
   amount
0      10
1     250
2    3000
3   40000
4  500000

Add leading zeros:
       amount
0  0000000010
1  0000000250
2  0000003000
3  0000040000
4  0000500000


In [105]:
def solution_1():
    return df['amount'].apply(lambda x: f'{x:0>8}')

# def solution_1():
#     return df['amount'].str.zfill(8)

def solution_2():    
    return list(map(lambda x: x.zfill(10), df['amount']))

nums = {'amount': ['10', '250', '3000', '40000', '500000']}
df = pd.DataFrame(nums)

test_solutions(solution_1, solution_2)

Solution 1 Time: 0.009750 seconds
Solution 2 Time: 0.000893 seconds


##### 5. Write a Pandas program to capitalize all the string values of specified columns of a given DataFrame.

In [111]:
df = pd.DataFrame({
    'col1': ['java', 'scala', 'python'],
    'col2': ['java', 'scala', 'python'],
    'col3': ['java', 'scala', 'python'],
})

df['col1'] = df['col1'].str.capitalize()
print(df)

     col1    col2    col3
0    Java    java    java
1   Scala   scala   scala
2  Python  python  python


In [112]:
df = pd.DataFrame({
    'name': ['alberto','gino','ryan', 'Eesha', 'syed'],
    'date_of_birth ': ['17/05/2002','16/02/1999','25/09/1998','11/05/2002','15/09/1997'],
    'age': [18.5, 21.2, 22.5, 22, 23]
})
print("Original DataFrame:")
print(df)

print("\nCapitalized DataFrame:")
df['name'] = list(map(lambda x: x.capitalize(), df.name))
print(df)

Original DataFrame:
      name date_of_birth    age
0  alberto     17/05/2002  18.5
1     gino     16/02/1999  21.2
2     ryan     25/09/1998  22.5
3    Eesha     11/05/2002  22.0
4     syed     15/09/1997  23.0

Capitalized DataFrame:
      name date_of_birth    age
0  Alberto     17/05/2002  18.5
1     Gino     16/02/1999  21.2
2     Ryan     25/09/1998  22.5
3    Eesha     11/05/2002  22.0
4     Syed     15/09/1997  23.0


In [None]:
def solution_1():
    df['name'] = df['name'].str.capitalize()

def solution_2():
    df['name'] = list(map(lambda x: x.capitalize(), df.name))
    
def create_df():
    df = pd.DataFrame({
        'name': ['alberto','gino','ryan', 'Eesha', 'syed'],
        'date_of_birth ': ['17/05/2002','16/02/1999','25/09/1998','11/05/2002','15/09/1997'],
        'age': [18.5, 21.2, 22.5, 22, 23]
    })

test_solutions(solution_1, solution_2, func = create_df)

In [182]:
def solution_1(df):
    df['name'] = df['name'].str.capitalize()

def solution_2(df):
    df['name'] = list(map(lambda x: x.capitalize(), df.name))
    
def create_df():
    count = 10000
    data = {
        'name': ['alberto','gino','ryan', 'Eesha', 'syed'] * count,
        'date_of_birth ': ['17/05/2002','16/02/1999','25/09/1998','11/05/2002','15/09/1997'] * count,
        'age': [18.5, 21.2, 22.5, 22, 23] * count
    }
    return pd.DataFrame(data)

test_solutions(solution_1, solution_2, func = create_df)

Solution 1 Time: 1.123047 seconds
Solution 2 Time: 2.233592 seconds


##### 6. Write a Pandas program to count of occurrence of a specified substring in a DataFrame column.

In [149]:
data = {
    'text_column': [
        'hello world',
        'world of pandas',
        'pandas are awesome',
        'hello again, pandas',
        'pandas are great',
        'welcome to the world of data science',
        'world of coding',
        'coding with pandas is fun',
        'hello hello hello',
        'pandas everywhere'
    ]
}

df = pd.DataFrame(data)

print(df)

count = df['text_column'].str.count('hello')
print(f'Count of substring "hello": {count.sum()}')

count = df['text_column'].str.count('world')
print(f'Count of substring "world": {count.sum()}')

df['count'] = list(map(lambda x: x.count('hello'), df['text_column']))
print(f'Count of substring "hello":')
print(df)

                            text_column
0                           hello world
1                       world of pandas
2                    pandas are awesome
3                   hello again, pandas
4                      pandas are great
5  welcome to the world of data science
6                       world of coding
7             coding with pandas is fun
8                     hello hello hello
9                     pandas everywhere
Count of substring "hello": 5
Count of substring "world": 4
Count of substring "hello":
                            text_column  count
0                           hello world      1
1                       world of pandas      0
2                    pandas are awesome      0
3                   hello again, pandas      1
4                      pandas are great      0
5  welcome to the world of data science      0
6                       world of coding      0
7             coding with pandas is fun      0
8                     hello hello hello      3
9 

In [144]:
df = pd.DataFrame({
    'name_code': ['c001','c002','c022', 'c2002', 'c2222'],
    'date_of_birth ': ['12/05/2002','16/02/1999','25/09/1998','12/02/2022','15/09/1997'],
    'age': [18.5, 21.2, 22.5, 22, 23]
})

print("Original DataFrame:")
print(df)

print("\nCount occurrence of 2 in date_of_birth column:")
df['count'] = list(map(lambda x: x.count("2"), df['name_code']))
print(df)

Original DataFrame:
  name_code date_of_birth    age
0      c001     12/05/2002  18.5
1      c002     16/02/1999  21.2
2      c022     25/09/1998  22.5
3     c2002     12/02/2022  22.0
4     c2222     15/09/1997  23.0

Count occurrence of 2 in date_of_birth column:
  name_code date_of_birth    age  count
0      c001     12/05/2002  18.5      0
1      c002     16/02/1999  21.2      1
2      c022     25/09/1998  22.5      2
3     c2002     12/02/2022  22.0      2
4     c2222     15/09/1997  23.0      4


In [199]:
data = {
    'text_column': [
        'hello world',
        'world of pandas',
        'pandas are awesome',
        'hello again, pandas',
        'pandas are great',
        'welcome to the world of data science',
        'world of coding',
        'coding with pandas is fun',
        'hello hello hello',
        'pandas everywhere'
    ] * 10000
}
    
df = pd.DataFrame(data)

def solution_1(df):
    df['text_column'] = df['text_column'].str.count('world')

def solution_2(df):
    df['text_column'] = list(map(lambda x: x.count("world"), df['text_column']))

test_solutions(solution_1, solution_2, df=df)

Solution 1 Time: 7.208842 seconds
Solution 2 Time: 7.240228 seconds


##### 7. Write a Pandas program to find the index of a given substring of a DataFrame column.

In [186]:
data = {
    'text_column': [
        'hello world',
        'world of pandas',
        'pandas are awesome',
        'hello again, pandas',
        'pandas are great',
        'welcome to the world of data science',
        'world of coding',
        'coding with pandas is fun',
        'hello hello hello',
        'pandas everywhere'
    ] * 1000
}

df = pd.DataFrame(data)

df['position'] = df['text_column'].str.find("world")
df

Unnamed: 0,text_column,position
0,hello world,6
1,world of pandas,0
2,pandas are awesome,-1
3,"hello again, pandas",-1
4,pandas are great,-1
...,...,...
9995,welcome to the world of data science,15
9996,world of coding,0
9997,coding with pandas is fun,-1
9998,hello hello hello,-1


In [191]:
def create_df():
    data = {
        'text_column': [
            'hello world',
            'world of pandas',
            'pandas are awesome',
            'hello again, pandas',
            'pandas are great',
            'welcome to the world of data science',
            'world of coding',
            'coding with pandas is fun',
            'hello hello hello',
            'pandas everywhere'
        ] * 100000
    }
    
    return pd.DataFrame(data)

def solution_1(df):
    df['text_column'] = df['text_column'].str.find("world")

def solution_2(df):
    df['text_column'] = list(map(lambda x: x.find("world"), df['text_column']))

test_solutions(solution_1, solution_2, func = create_df)

Solution 1 Time: 78.910252 seconds
Solution 2 Time: 70.021849 seconds


##### 8. Write a Pandas program to find the index of a substring of DataFrame with beginning and end position.

In [201]:
data = {
    'text_column': [
        'hello world',
        'world of pandas',
        'pandas are awesome',
        'hello again, pandas',
        'pandas are great',
        'welcome to the world of data science',
        'world of coding',
        'coding with pandas is fun',
        'hello hello hello',
        'pandas everywhere'
    ] * 1000
}

df = pd.DataFrame(data)

substr = "world"

df['position'] = df['text_column'].str.find(substr, start=0, end=5)

df

Unnamed: 0,text_column,start,end
0,hello world,-1,4
1,world of pandas,0,5
2,pandas are awesome,-1,4
3,"hello again, pandas",-1,4
4,pandas are great,-1,4
...,...,...,...
9995,welcome to the world of data science,-1,4
9996,world of coding,0,5
9997,coding with pandas is fun,-1,4
9998,hello hello hello,-1,4


In [203]:
data = {
    'text_column': [
        'hello world',
        'world of pandas',
        'pandas are awesome',
        'hello again, pandas',
        'pandas are great',
        'welcome to the world of data science',
        'world of coding',
        'coding with pandas is fun',
        'hello hello hello',
        'pandas everywhere'
    ] * 100000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['position'] = df['text_column'].str.find("world", start=0, end=5)
    df

def solution_2(df):
    df['position'] = list(map(lambda x: x.find('world', 0, 5), df['text_column']))
    df

test_solutions(solution_1, solution_2, df = df)

Solution 1 Time: 79.331116 seconds
Solution 2 Time: 75.682947 seconds


##### 9. Write a Pandas program to check whether alpha numeric values present in a given column of a DataFrame.

In [215]:
data = {
    'col': ['one', '1', '2', 'second 2!', ' '] * 10000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.isalnum()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.isalnum(), df['col']))
    return df

print(solution_1(df.copy()))

print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

             col  result
0            one    True
1              1    True
2              2    True
3      second 2!   False
4                  False
...          ...     ...
49995        one    True
49996          1    True
49997          2    True
49998  second 2!   False
49999              False

[50000 rows x 2 columns]
             col  result
0            one    True
1              1    True
2              2    True
3      second 2!   False
4                  False
...          ...     ...
49995        one    True
49996          1    True
49997          2    True
49998  second 2!   False
49999              False

[50000 rows x 2 columns]
Solution 1 Time: 0.784256 seconds
Solution 2 Time: 1.877709 seconds


##### 10. Write a Pandas program to check whether alphabetic values present in a given column of a DataFrame.

In [216]:
data = {
    'col': ['one', '1', '2', 'second 2!', ' '] * 10000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.isalpha()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.isalpha(), df['col']))
    return df

print(solution_1(df.copy()))

print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

             col  result
0            one    True
1              1   False
2              2   False
3      second 2!   False
4                  False
...          ...     ...
49995        one    True
49996          1   False
49997          2   False
49998  second 2!   False
49999              False

[50000 rows x 2 columns]
             col  result
0            one    True
1              1   False
2              2   False
3      second 2!   False
4                  False
...          ...     ...
49995        one    True
49996          1   False
49997          2   False
49998  second 2!   False
49999              False

[50000 rows x 2 columns]
Solution 1 Time: 0.675793 seconds
Solution 2 Time: 1.761297 seconds


##### 11. Write a Pandas program to check whether only numeric values present in a given column of a DataFrame.

In [223]:
data = {
    'col': ['one', '1', '2', 'second 2!', ' '] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.isnumeric()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.isnumeric(), df['col']))
    return df

print(solution_1(df.copy()))

print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

            col  result
0           one   False
1             1    True
2             2    True
3     second 2!   False
4                 False
...         ...     ...
4995        one   False
4996          1    True
4997          2    True
4998  second 2!   False
4999              False

[5000 rows x 2 columns]
            col  result
0           one   False
1             1    True
2             2    True
3     second 2!   False
4                 False
...         ...     ...
4995        one   False
4996          1    True
4997          2    True
4998  second 2!   False
4999              False

[5000 rows x 2 columns]
Solution 1 Time: 0.166962 seconds
Solution 2 Time: 0.221855 seconds


##### 12. Write a Pandas program to check whether only lower case or upper case is present in a given column of a DataFrame.

In [229]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', 'LOWer', 'UPper', 'LOWer', 'UPper'] * 100
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.islower()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.islower(), df['col']))
    return df

print(solution_1(df.copy()))

print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

       col  result
0        l    True
1        U   False
2    lower    True
3    UPPER   False
4    Lower   False
..     ...     ...
995  Upper   False
996  LOWer   False
997  UPper   False
998  LOWer   False
999  UPper   False

[1000 rows x 2 columns]
       col  result
0        l    True
1        U   False
2    lower    True
3    UPPER   False
4    Lower   False
..     ...     ...
995  Upper   False
996  LOWer   False
997  UPper   False
998  LOWer   False
999  UPper   False

[1000 rows x 2 columns]
Solution 1 Time: 0.088816 seconds
Solution 2 Time: 0.065051 seconds


##### 13. Write a Pandas program to check whether only proper case or title case is present in a given column of a DataFrame.

In [231]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', 'LOWer', 'UPper', 'LOWer', 'UPper'] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.istitle()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.istitle(), df['col']))
    return df

print(solution_1(df.copy()))

print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

        col  result
0         l   False
1         U    True
2     lower   False
3     UPPER   False
4     Lower    True
...     ...     ...
9995  Upper    True
9996  LOWer   False
9997  UPper   False
9998  LOWer   False
9999  UPper   False

[10000 rows x 2 columns]
        col  result
0         l   False
1         U    True
2     lower   False
3     UPPER   False
4     Lower    True
...     ...     ...
9995  Upper    True
9996  LOWer   False
9997  UPper   False
9998  LOWer   False
9999  UPper   False

[10000 rows x 2 columns]
Solution 1 Time: 0.200889 seconds
Solution 2 Time: 0.388222 seconds


##### 14. Write a Pandas program to check whether only space is present in a given column of a DataFrame.

In [245]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ', '  '] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.isspace()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.isspace(), df['col']))
    return df

print(solution_1(df.copy()))

print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

         col  result
0          l   False
1          U   False
2      lower   False
3      UPPER   False
4      Lower   False
...      ...     ...
9995   Upper   False
9996   LOWer   False
9997   UPper   False
9998            True
9999            True

[10000 rows x 2 columns]
         col  result
0          l   False
1          U   False
2      lower   False
3      UPPER   False
4      Lower   False
...      ...     ...
9995   Upper   False
9996   LOWer   False
9997   UPper   False
9998            True
9999            True

[10000 rows x 2 columns]
Solution 1 Time: 0.201814 seconds
Solution 2 Time: 0.404182 seconds


##### 15. Write a Pandas program to get the length of the string present of a given column in a DataFrame.

In [253]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ',  ' '] * 100
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.len()
    return df

def solution_2(df):
    df['result'] = df['col'].apply(len)
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

Solution 1 result:
        col  result
0         l       1
1         U       1
2     lower       5
3     UPPER       5
4     Lower       5
..      ...     ...
995   Upper       5
996   LOWer       6
997   UPper       6
998               2
999               1

[1000 rows x 2 columns]
Solution 2 result:
        col  result
0         l       1
1         U       1
2     lower       5
3     UPPER       5
4     Lower       5
..      ...     ...
995   Upper       5
996   LOWer       6
997   UPper       6
998               2
999               1

[1000 rows x 2 columns]
Solution 1 Time: 0.123215 seconds
Solution 2 Time: 0.074916 seconds


##### 16. Write a Pandas program to get the length of the integer of a given column in a DataFrame.

In [262]:
data = {
    'col': [1, 2, 10, 20, 101, 201, 1002, 2002, 10003, 20003] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].astype(str).str.len()
    return df

def solution_2(df):
    df['result'] = df['col'].apply(str).str.len()
    return df

def solution_3(df):
    df['result'] = df['col'].apply(lambda i: len(str(i)))
    return df

def solution_4(df):
    df['result'] = list(map(lambda i: len(str(i)), df['col']))
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))
print()
print('Solution 3 result:')
print(solution_3(df.copy()))

print()
print('Solution 4 result:')
print(solution_4(df.copy()))

test_solutions(solution_1, solution_2, solution_3, solution_4, df = df)

Solution 1 result:
        col  result
0         1       1
1         2       1
2        10       2
3        20       2
4       101       3
...     ...     ...
9995    201       3
9996   1002       4
9997   2002       4
9998  10003       5
9999  20003       5

[10000 rows x 2 columns]

Solution 2 result:
        col  result
0         1       1
1         2       1
2        10       2
3        20       2
4       101       3
...     ...     ...
9995    201       3
9996   1002       4
9997   2002       4
9998  10003       5
9999  20003       5

[10000 rows x 2 columns]

Solution 3 result:
        col  result
0         1       1
1         2       1
2        10       2
3        20       2
4       101       3
...     ...     ...
9995    201       3
9996   1002       4
9997   2002       4
9998  10003       5
9999  20003       5

[10000 rows x 2 columns]

Solution 4 result:
        col  result
0         1       1
1         2       1
2        10       2
3        20       2
4       101       3
...

##### 17. Write a Pandas program to check if a specified column starts with a specified string in a DataFrame.

In [263]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ',  ' '] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.startswith('l')
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.startswith('l') ,df['col']))
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

Solution 1 result:
         col  result
0          l    True
1          U   False
2      lower    True
3      UPPER   False
4      Lower   False
...      ...     ...
9995   Upper   False
9996   LOWer   False
9997   UPper   False
9998           False
9999           False

[10000 rows x 2 columns]

Solution 2 result:
         col  result
0          l    True
1          U   False
2      lower    True
3      UPPER   False
4      Lower   False
...      ...     ...
9995   Upper   False
9996   LOWer   False
9997   UPper   False
9998           False
9999           False

[10000 rows x 2 columns]
Solution 1 Time: 0.456194 seconds
Solution 2 Time: 0.513817 seconds


##### 18. Write a Pandas program to swap the cases of a specified character column in a given DataFrame.

In [264]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ',  ' '] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.swapcase()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.swapcase(), df['col']))
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

Solution 1 result:
         col  result
0          l       L
1          U       u
2      lower   LOWER
3      UPPER   upper
4      Lower   lOWER
...      ...     ...
9995   Upper   uPPER
9996   LOWer   lowER
9997   UPper   upPER
9998                
9999                

[10000 rows x 2 columns]

Solution 2 result:
         col  result
0          l       L
1          U       u
2      lower   LOWER
3      UPPER   upper
4      Lower   lOWER
...      ...     ...
9995   Upper   uPPER
9996   LOWer   lowER
9997   UPper   upPER
9998                
9999                

[10000 rows x 2 columns]
Solution 1 Time: 0.311847 seconds
Solution 2 Time: 0.534657 seconds


##### 19. Write a Pandas program to convert a specified character column in upper/lower cases in a given DataFrame.

In [272]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ',  ' '] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.lower()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.lower(), df['col']))
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

Solution 1 result:
         col  result
0          l       l
1          U       u
2      lower   lower
3      UPPER   upper
4      Lower   lower
...      ...     ...
9995   Upper   upper
9996   LOWer   lower
9997   UPper   upper
9998                
9999                

[10000 rows x 2 columns]

Solution 2 result:
         col  result
0          l       l
1          U       u
2      lower   lower
3      UPPER   upper
4      Lower   lower
...      ...     ...
9995   Upper   upper
9996   LOWer   lower
9997   UPper   upper
9998                
9999                

[10000 rows x 2 columns]
Solution 1 Time: 0.215068 seconds
Solution 2 Time: 0.392519 seconds


##### 20. Write a Pandas program to convert a specified character column in title case in a given DataFrame.

In [273]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ',  ' '] * 1000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.title()
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.title(), df['col']))
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))

test_solutions(solution_1, solution_2, df = df)

Solution 1 result:
         col  result
0          l       L
1          U       U
2      lower   Lower
3      UPPER   Upper
4      Lower   Lower
...      ...     ...
9995   Upper   Upper
9996   LOWer   Lower
9997   UPper   Upper
9998                
9999                

[10000 rows x 2 columns]

Solution 2 result:
         col  result
0          l       L
1          U       U
2      lower   Lower
3      UPPER   Upper
4      Lower   Lower
...      ...     ...
9995   Upper   Upper
9996   LOWer   Lower
9997   UPper   Upper
9998                
9999                

[10000 rows x 2 columns]
Solution 1 Time: 0.270474 seconds
Solution 2 Time: 0.433619 seconds


##### 21. Write a Pandas program to replace arbitrary values with other values in a given DataFrame.

In [285]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ',  ' '] * 1000000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].str.replace('l', 'л')
    return df

def solution_2(df):
    df['result'] = list(map(lambda s: s.replace('l', 'л'), df['col']))
    return df

def solution_3(df):
    df['result'] = df['col'].str.replace('l', 'л').str.replace('L', 'л')
    return df

def solution_4(df):
    df['result'] = list(map(lambda s: s.replace('l', 'л').replace('L', 'л'), df['col']))
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))
print()
print('Solution 3 result:')
print(solution_3(df.copy()))
print()
print('Solution 4 result:')
print(solution_4(df.copy()))

test_solutions(solution_1, solution_2, solution_3, solution_4, df = df)

Solution 1 result:
            col  result
0             l       л
1             U       U
2         lower   лower
3         UPPER   UPPER
4         Lower   Lower
...         ...     ...
9999995   Upper   Upper
9999996   LOWer   LOWer
9999997   UPper   UPper
9999998                
9999999                

[10000000 rows x 2 columns]

Solution 2 result:
            col  result
0             l       л
1             U       U
2         lower   лower
3         UPPER   UPPER
4         Lower   Lower
...         ...     ...
9999995   Upper   Upper
9999996   LOWer   LOWer
9999997   UPper   UPper
9999998                
9999999                

[10000000 rows x 2 columns]

Solution 3 result:
            col  result
0             l       л
1             U       U
2         lower   лower
3         UPPER   UPPER
4         Lower   лower
...         ...     ...
9999995   Upper   Upper
9999996   LOWer   лOWer
9999997   UPper   UPper
9999998                
9999999                

[10000000 rows x 2

##### 22. Write a Pandas program to replace more than one value with other values in a given DataFrame.

In [291]:
data = {
    'col': ['l', 'U', 'lower', 'UPPER', 'Lower', 'Upper', ' LOWer', ' UPper', '  ',  ' '] * 1000000
}

df = pd.DataFrame(data)

def solution_1(df):
    df['result'] = df['col'].replace(['l', 'U'], ['л', 'В'])
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))

test_solutions(solution_1, df = df)

Solution 1 result:
            col  result
0             l       л
1             U       В
2         lower   lower
3         UPPER   UPPER
4         Lower   Lower
...         ...     ...
9999995   Upper   Upper
9999996   LOWer   LOWer
9999997   UPper   UPper
9999998                
9999999                

[10000000 rows x 2 columns]
Solution 1 Time: 1.187795 seconds


##### 23. Write a Pandas program to split a string of a column of a given DataFrame into multiple columns.

In [300]:
count = 1000
data = {
    'full name': [
        'Alberto Franco',
        'Gino Mcneill',
        'Ryan Parkes',
        'Eesha Hinton',
        'Syed Khan'
    ] * count,
    'date of birth': [
        '15-August-2002',
        '17-May-2002',
        '16-February-1999',
        '25-September-1998',
        '11-May-2002'
    ] * count
}

df = pd.DataFrame(data)

print(df)

def solution_1(df):
    df[['day', 'month', 'year']] = df['date of birth'].str.split('-', expand=True)
    return df

def solution_2(df):
    df['result'] = df['date of birth'].str.split('-')
    return df

def solution_3(df):
    df[['day', 'month', 'year']] = list(map(lambda s: s.split('-'), df['date of birth']))
    return df

print('Solution 1 result:')
print(solution_1(df.copy()))
print()
print('Solution 2 result:')
print(solution_2(df.copy()))
print()
print('Solution 3 result:')
print(solution_3(df.copy()))

test_solutions(solution_1, solution_2, solution_3, df = df)

           full name      date of birth
0     Alberto Franco     15-August-2002
1       Gino Mcneill        17-May-2002
2        Ryan Parkes   16-February-1999
3       Eesha Hinton  25-September-1998
4          Syed Khan        11-May-2002
...              ...                ...
4995  Alberto Franco     15-August-2002
4996    Gino Mcneill        17-May-2002
4997     Ryan Parkes   16-February-1999
4998    Eesha Hinton  25-September-1998
4999       Syed Khan        11-May-2002

[5000 rows x 2 columns]
Solution 1 result:
           full name      date of birth day      month  year
0     Alberto Franco     15-August-2002  15     August  2002
1       Gino Mcneill        17-May-2002  17        May  2002
2        Ryan Parkes   16-February-1999  16   February  1999
3       Eesha Hinton  25-September-1998  25  September  1998
4          Syed Khan        11-May-2002  11        May  2002
...              ...                ...  ..        ...   ...
4995  Alberto Franco     15-August-2002  15     A

##### 24. Write a Pandas program to extract email from a specified column of string type of a given DataFrame.

In [315]:
count = 1000
data = {
    'Full Name': [
        'John Doe', 
        'Jane Smith', 
        'Michael Johnson', 
        'Sarah Williams', 
        'David Brown'
    ] * count,
    'Date of Birth': [
        '15-March-1990', 
        '22-April-1985', 
        '30-May-1978', 
        '12-September-1992', 
        '05-July-1983'
    ] * count,
    'Message': [
        'Please contact us at john.doe@example.com.',
        'Reach out via email: jane.smith@example.com.',
        'For inquiries, email mike.johnson@example.com.',
        'Email: sarah.williams@example.com for more details.',
        'Get in touch at david.brown@example.com.'
    ] * count
}

df = pd.DataFrame(data)
display(df)

def solution_1(df):
    df['email'] = df['Message'].str.extract(r'([a-zA-Z0-9_.±]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-]+)')
    return df

def solution_2(df):
    import re
    df['email'] = list(map(lambda s: re.search('([a-zA-Z0-9_.±]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-]+)', s).group(1), df['Message']))
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Full Name,Date of Birth,Message
0,John Doe,15-March-1990,Please contact us at john.doe@example.com.
1,Jane Smith,22-April-1985,Reach out via email: jane.smith@example.com.
2,Michael Johnson,30-May-1978,"For inquiries, email mike.johnson@example.com."
3,Sarah Williams,12-September-1992,Email: sarah.williams@example.com for more det...
4,David Brown,05-July-1983,Get in touch at david.brown@example.com.
...,...,...,...
4995,John Doe,15-March-1990,Please contact us at john.doe@example.com.
4996,Jane Smith,22-April-1985,Reach out via email: jane.smith@example.com.
4997,Michael Johnson,30-May-1978,"For inquiries, email mike.johnson@example.com."
4998,Sarah Williams,12-September-1992,Email: sarah.williams@example.com for more det...


Solution 1 result:


Unnamed: 0,Full Name,Date of Birth,Message,email
0,John Doe,15-March-1990,Please contact us at john.doe@example.com.,john.doe@example.com
1,Jane Smith,22-April-1985,Reach out via email: jane.smith@example.com.,jane.smith@example.com
2,Michael Johnson,30-May-1978,"For inquiries, email mike.johnson@example.com.",mike.johnson@example.com
3,Sarah Williams,12-September-1992,Email: sarah.williams@example.com for more det...,sarah.williams@example.com
4,David Brown,05-July-1983,Get in touch at david.brown@example.com.,david.brown@example.com
...,...,...,...,...
4995,John Doe,15-March-1990,Please contact us at john.doe@example.com.,john.doe@example.com
4996,Jane Smith,22-April-1985,Reach out via email: jane.smith@example.com.,jane.smith@example.com
4997,Michael Johnson,30-May-1978,"For inquiries, email mike.johnson@example.com.",mike.johnson@example.com
4998,Sarah Williams,12-September-1992,Email: sarah.williams@example.com for more det...,sarah.williams@example.com



Solution 2 result:


Unnamed: 0,Full Name,Date of Birth,Message,email
0,John Doe,15-March-1990,Please contact us at john.doe@example.com.,john.doe@example.com
1,Jane Smith,22-April-1985,Reach out via email: jane.smith@example.com.,jane.smith@example.com
2,Michael Johnson,30-May-1978,"For inquiries, email mike.johnson@example.com.",mike.johnson@example.com
3,Sarah Williams,12-September-1992,Email: sarah.williams@example.com for more det...,sarah.williams@example.com
4,David Brown,05-July-1983,Get in touch at david.brown@example.com.,david.brown@example.com
...,...,...,...,...
4995,John Doe,15-March-1990,Please contact us at john.doe@example.com.,john.doe@example.com
4996,Jane Smith,22-April-1985,Reach out via email: jane.smith@example.com.,jane.smith@example.com
4997,Michael Johnson,30-May-1978,"For inquiries, email mike.johnson@example.com.",mike.johnson@example.com
4998,Sarah Williams,12-September-1992,Email: sarah.williams@example.com for more det...,sarah.williams@example.com


Solution 1 Time: 1.200988 seconds
Solution 2 Time: 1.482690 seconds


##### 25. Write a Pandas program to extract hash attached word from twitter text from the specified column of a given DataFrame.

In [358]:
count = 1000
data = {
    'Tweet': [
        "Just finished a great workout! #fitness #healthyliving",
        "Amazing views from the top of the mountain! #nature #hiking",
        "Can't wait for the weekend! #TGIF #relaxation",
        "New blog post is up! Check it out. #blogging #contentcreator",
        "Exploring the city today! #travel #adventure"
    ] * count
}

df = pd.DataFrame(data)
print(df)

def solution_1(df):
    df['hashtags'] = df['Tweet'].str.findall(r'#(\w+)').apply(lambda tags: ', '.join(tags))
    return df

def solution_2(df):
    import re
    hashtags = list(map(lambda s: re.findall(r'#(\w+)', s), df['Tweet']))
    df['hashtags'] = [', '.join(tags) for tags in hashtags]
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

                                                  Tweet
0     Just finished a great workout! #fitness #healt...
1     Amazing views from the top of the mountain! #n...
2         Can't wait for the weekend! #TGIF #relaxation
3     New blog post is up! Check it out. #blogging #...
4          Exploring the city today! #travel #adventure
...                                                 ...
4995  Just finished a great workout! #fitness #healt...
4996  Amazing views from the top of the mountain! #n...
4997      Can't wait for the weekend! #TGIF #relaxation
4998  New blog post is up! Check it out. #blogging #...
4999       Exploring the city today! #travel #adventure

[5000 rows x 1 columns]
Solution 1 result:


Unnamed: 0,Tweet,hashtags
0,Just finished a great workout! #fitness #healt...,"fitness, healthyliving"
1,Amazing views from the top of the mountain! #n...,"nature, hiking"
2,Can't wait for the weekend! #TGIF #relaxation,"TGIF, relaxation"
3,New blog post is up! Check it out. #blogging #...,"blogging, contentcreator"
4,Exploring the city today! #travel #adventure,"travel, adventure"
...,...,...
4995,Just finished a great workout! #fitness #healt...,"fitness, healthyliving"
4996,Amazing views from the top of the mountain! #n...,"nature, hiking"
4997,Can't wait for the weekend! #TGIF #relaxation,"TGIF, relaxation"
4998,New blog post is up! Check it out. #blogging #...,"blogging, contentcreator"



Solution 2 result:


Unnamed: 0,Tweet,hashtags
0,Just finished a great workout! #fitness #healt...,"fitness, healthyliving"
1,Amazing views from the top of the mountain! #n...,"nature, hiking"
2,Can't wait for the weekend! #TGIF #relaxation,"TGIF, relaxation"
3,New blog post is up! Check it out. #blogging #...,"blogging, contentcreator"
4,Exploring the city today! #travel #adventure,"travel, adventure"
...,...,...
4995,Just finished a great workout! #fitness #healt...,"fitness, healthyliving"
4996,Amazing views from the top of the mountain! #n...,"nature, hiking"
4997,Can't wait for the weekend! #TGIF #relaxation,"TGIF, relaxation"
4998,New blog post is up! Check it out. #blogging #...,"blogging, contentcreator"


Solution 1 Time: 0.779036 seconds
Solution 2 Time: 1.258239 seconds


##### 26. Write a Pandas program to extract word mention someone in tweets using @ from the specified column of a given DataFrame.

In [363]:
import re
count = 1000
data = {
    'Tweet': [
        'Great meeting with @john_doe and @jane_smith today!',
        '@mike_jones and @susan_williams are working on the project.',
        'Had a wonderful lunch with @alice_brown and @bob_white.',
        'Looking forward to the conference with @mary_johnson.',
        'Shoutout to @charlie_green for the amazing support!',
        'Follow @tech_guru for the latest tech updates.',
        'Excited for the upcoming event with @emily_davis and @daniel_clark.'
    ] * 1000
}

df = pd.DataFrame(data)
print(df)

def solution_1(df):
    df['mention'] = df['Tweet'].str.findall(r'(?<=@)\w+').apply(lambda tags: ', '.join(tags))
    return df

def solution_2(df):    
    def exctract_mention(tweet):
        mentions = re.findall(r'(?<=@)\w+', tweet)
        return ', '.join(mentions)    
    df['mention'] = df['Tweet'].apply(exctract_mention)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

                                                  Tweet
0     Great meeting with @john_doe and @jane_smith t...
1     @mike_jones and @susan_williams are working on...
2     Had a wonderful lunch with @alice_brown and @b...
3     Looking forward to the conference with @mary_j...
4     Shoutout to @charlie_green for the amazing sup...
...                                                 ...
6995  Had a wonderful lunch with @alice_brown and @b...
6996  Looking forward to the conference with @mary_j...
6997  Shoutout to @charlie_green for the amazing sup...
6998     Follow @tech_guru for the latest tech updates.
6999  Excited for the upcoming event with @emily_dav...

[7000 rows x 1 columns]
Solution 1 result:


Unnamed: 0,Tweet,mention
0,Great meeting with @john_doe and @jane_smith t...,"john_doe, jane_smith"
1,@mike_jones and @susan_williams are working on...,"mike_jones, susan_williams"
2,Had a wonderful lunch with @alice_brown and @b...,"alice_brown, bob_white"
3,Looking forward to the conference with @mary_j...,mary_johnson
4,Shoutout to @charlie_green for the amazing sup...,charlie_green
...,...,...
6995,Had a wonderful lunch with @alice_brown and @b...,"alice_brown, bob_white"
6996,Looking forward to the conference with @mary_j...,mary_johnson
6997,Shoutout to @charlie_green for the amazing sup...,charlie_green
6998,Follow @tech_guru for the latest tech updates.,tech_guru



Solution 2 result:


Unnamed: 0,Tweet,mention
0,Great meeting with @john_doe and @jane_smith t...,"john_doe, jane_smith"
1,@mike_jones and @susan_williams are working on...,"mike_jones, susan_williams"
2,Had a wonderful lunch with @alice_brown and @b...,"alice_brown, bob_white"
3,Looking forward to the conference with @mary_j...,mary_johnson
4,Shoutout to @charlie_green for the amazing sup...,charlie_green
...,...,...
6995,Had a wonderful lunch with @alice_brown and @b...,"alice_brown, bob_white"
6996,Looking forward to the conference with @mary_j...,mary_johnson
6997,Shoutout to @charlie_green for the amazing sup...,charlie_green
6998,Follow @tech_guru for the latest tech updates.,tech_guru


Solution 1 Time: 1.660263 seconds
Solution 2 Time: 1.957392 seconds


##### 27. Write a Pandas program to extract only number from the specified column of a given DataFrame.

In [3]:
import re
count = 1000000
data = {
    'text': [
        'Number 100',
        'Sum of total is $5500',
        'Other number 777',
        '800 number and other number 950',
        '1000538'
    ] * count
}

df = pd.DataFrame(data)
display(df)

def solution_1(df):
    df['numbers'] = df['text'].str.findall(r'\d+').apply(lambda tags: ', '.join(tags))
    return df

def solution_2(df):    
    def exctract_numbers(text):
        numbers = re.findall(r'\d+', text)
        return ', '.join(numbers)    
    df['numbers'] = df['text'].apply(exctract_numbers)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,text
0,Number 100
1,Sum of total is $5500
2,Other number 777
3,800 number and other number 950
4,1000538
...,...
4999995,Number 100
4999996,Sum of total is $5500
4999997,Other number 777
4999998,800 number and other number 950


Solution 1 result:


Unnamed: 0,text,numbers
0,Number 100,100
1,Sum of total is $5500,5500
2,Other number 777,777
3,800 number and other number 950,"800, 950"
4,1000538,1000538
...,...,...
4999995,Number 100,100
4999996,Sum of total is $5500,5500
4999997,Other number 777,777
4999998,800 number and other number 950,"800, 950"



Solution 2 result:


Unnamed: 0,text,numbers
0,Number 100,100
1,Sum of total is $5500,5500
2,Other number 777,777
3,800 number and other number 950,"800, 950"
4,1000538,1000538
...,...,...
4999995,Number 100,100
4999996,Sum of total is $5500,5500
4999997,Other number 777,777
4999998,800 number and other number 950,"800, 950"


Solution 1 Time: 5.717723 seconds
Solution 2 Time: 10.017959 seconds


##### 28. Write a Pandas program to extract only phone number from the specified column of a given DataFrame.

In [13]:
count = 200000
data = {
    'Name': ['John Doe', 'Jane Smith', 'Mike Johnson', 'Emily Davis', 'David Brown'] * count,
    'Contact_Info': [
        'Call me at 123-456-7890',
        'Reach me on 987.654.3210 or email at jane.smith@example.com',
        'Phone: (555) 123-4567, Office: 1234-567-890',
        'Mobile: 555-987-6543, Work: 555.654.3210',
        'Contact: +1-800-555-1234 for more details'
    ] * count,
    'Address': [
        '123 Elm Street, Springfield',
        '456 Maple Avenue, Metropolis',
        '789 Oak Lane, Gotham',
        '321 Pine Road, Star City',
        '654 Cedar Boulevard, Central City'
    ] * count
}

df = pd.DataFrame(data)

display(df)

# regex = r'(\+?\d{1,2}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}'
regex = r'(?:\+?\d{1,2}[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}'

def solution_1(df):
    df['Phone numbers'] = df['Contact_Info'].str.findall(regex).apply(lambda numbers: ', '.join(numbers))
    return df

def solution_2(df):    
    def exctract_numbers(text):
        numbers = re.findall(regex, text)
        return ', '.join(numbers)    
    df['Phone numbers'] = df['Contact_Info'].apply(exctract_numbers)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Name,Contact_Info,Address
0,John Doe,Call me at 123-456-7890,"123 Elm Street, Springfield"
1,Jane Smith,Reach me on 987.654.3210 or email at jane.smit...,"456 Maple Avenue, Metropolis"
2,Mike Johnson,"Phone: (555) 123-4567, Office: 1234-567-890","789 Oak Lane, Gotham"
3,Emily Davis,"Mobile: 555-987-6543, Work: 555.654.3210","321 Pine Road, Star City"
4,David Brown,Contact: +1-800-555-1234 for more details,"654 Cedar Boulevard, Central City"
...,...,...,...
999995,John Doe,Call me at 123-456-7890,"123 Elm Street, Springfield"
999996,Jane Smith,Reach me on 987.654.3210 or email at jane.smit...,"456 Maple Avenue, Metropolis"
999997,Mike Johnson,"Phone: (555) 123-4567, Office: 1234-567-890","789 Oak Lane, Gotham"
999998,Emily Davis,"Mobile: 555-987-6543, Work: 555.654.3210","321 Pine Road, Star City"


Solution 1 result:


Unnamed: 0,Name,Contact_Info,Address,Phone numbers
0,John Doe,Call me at 123-456-7890,"123 Elm Street, Springfield",123-456-7890
1,Jane Smith,Reach me on 987.654.3210 or email at jane.smit...,"456 Maple Avenue, Metropolis",987.654.3210
2,Mike Johnson,"Phone: (555) 123-4567, Office: 1234-567-890","789 Oak Lane, Gotham",(555) 123-4567
3,Emily Davis,"Mobile: 555-987-6543, Work: 555.654.3210","321 Pine Road, Star City","555-987-6543, 555.654.3210"
4,David Brown,Contact: +1-800-555-1234 for more details,"654 Cedar Boulevard, Central City",+1-800-555-1234
...,...,...,...,...
999995,John Doe,Call me at 123-456-7890,"123 Elm Street, Springfield",123-456-7890
999996,Jane Smith,Reach me on 987.654.3210 or email at jane.smit...,"456 Maple Avenue, Metropolis",987.654.3210
999997,Mike Johnson,"Phone: (555) 123-4567, Office: 1234-567-890","789 Oak Lane, Gotham",(555) 123-4567
999998,Emily Davis,"Mobile: 555-987-6543, Work: 555.654.3210","321 Pine Road, Star City","555-987-6543, 555.654.3210"



Solution 2 result:


Unnamed: 0,Name,Contact_Info,Address,Phone numbers
0,John Doe,Call me at 123-456-7890,"123 Elm Street, Springfield",123-456-7890
1,Jane Smith,Reach me on 987.654.3210 or email at jane.smit...,"456 Maple Avenue, Metropolis",987.654.3210
2,Mike Johnson,"Phone: (555) 123-4567, Office: 1234-567-890","789 Oak Lane, Gotham",(555) 123-4567
3,Emily Davis,"Mobile: 555-987-6543, Work: 555.654.3210","321 Pine Road, Star City","555-987-6543, 555.654.3210"
4,David Brown,Contact: +1-800-555-1234 for more details,"654 Cedar Boulevard, Central City",+1-800-555-1234
...,...,...,...,...
999995,John Doe,Call me at 123-456-7890,"123 Elm Street, Springfield",123-456-7890
999996,Jane Smith,Reach me on 987.654.3210 or email at jane.smit...,"456 Maple Avenue, Metropolis",987.654.3210
999997,Mike Johnson,"Phone: (555) 123-4567, Office: 1234-567-890","789 Oak Lane, Gotham",(555) 123-4567
999998,Emily Davis,"Mobile: 555-987-6543, Work: 555.654.3210","321 Pine Road, Star City","555-987-6543, 555.654.3210"


Solution 1 Time: 7.872349 seconds
Solution 2 Time: 8.630942 seconds


##### 29. Write a Pandas program to extract year between 1800 to 2200 from the specified column of a given DataFrame.

In [19]:
count = 1000000
data = {
    'Text': [
        'The event took place in 1999 and was remarkable.',
        'In the year 2021, something significant happened.',
        'He was born in 1856 and lived a long life.',
        'They predict that by 2100, technology will be far advanced.',
        'The painting was created in 1821 and restored in 2015.',
        'No specific year is mentioned here.',
        'An ancient artifact from 1776 was discovered.',
        'The year 2200 seems like a distant future.',
        'In 1800, the world was a different place.',
        'Will 2199 be a year of change?'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'\b(?:18|19|20|21)\d{2}|2200\b'

def solution_1(df):
    df['Years'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    def exctract_numbers(text):
        return re.findall(regex, text)
    df['Years'] = df['Text'].apply(exctract_numbers)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,The event took place in 1999 and was remarkable.
1,"In the year 2021, something significant happened."
2,He was born in 1856 and lived a long life.
3,"They predict that by 2100, technology will be ..."
4,The painting was created in 1821 and restored ...
...,...
9999995,No specific year is mentioned here.
9999996,An ancient artifact from 1776 was discovered.
9999997,The year 2200 seems like a distant future.
9999998,"In 1800, the world was a different place."


Solution 1 result:


Unnamed: 0,Text,Years
0,The event took place in 1999 and was remarkable.,[1999]
1,"In the year 2021, something significant happened.",[2021]
2,He was born in 1856 and lived a long life.,[1856]
3,"They predict that by 2100, technology will be ...",[2100]
4,The painting was created in 1821 and restored ...,"[1821, 2015]"
...,...,...
9999995,No specific year is mentioned here.,[]
9999996,An ancient artifact from 1776 was discovered.,[]
9999997,The year 2200 seems like a distant future.,[2200]
9999998,"In 1800, the world was a different place.",[1800]



Solution 2 result:


Unnamed: 0,Text,Years
0,The event took place in 1999 and was remarkable.,[1999]
1,"In the year 2021, something significant happened.",[2021]
2,He was born in 1856 and lived a long life.,[1856]
3,"They predict that by 2100, technology will be ...",[2100]
4,The painting was created in 1821 and restored ...,"[1821, 2015]"
...,...,...
9999995,No specific year is mentioned here.,[]
9999996,An ancient artifact from 1776 was discovered.,[]
9999997,The year 2200 seems like a distant future.,[2200]
9999998,"In 1800, the world was a different place.",[1800]


Solution 1 Time: 23.491261 seconds
Solution 2 Time: 34.561925 seconds


##### 30. Write a Pandas program to extract only non alphanumeric characters from the specified column of a given DataFrame.

In [23]:
count = 1000
data = {
    'Text': [
        '#The event took $place in 1999 and was remarkable.',
        'In the year 2021, something significant happened.',
        'He was born in 1856 and lived a long life.',
        'They predict that ^by 2100, technology will be far advanced.',
        'The painting was &created in 1821 and restored in 2015.',
        'No specific year is mentioned here.',
        'An ancient artifac**t from 1776 was discovered.',
        'The year 2200 seems like a distant future.',
        'In 1800, the world was a different place.',
        'Will 2199 be a year of change?'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'\W'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    def exctract_values(text):
        return re.findall(regex, text)
    df['result'] = df['Text'].apply(exctract_values)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,#The event took $place in 1999 and was remarka...
1,"In the year 2021, something significant happened."
2,He was born in 1856 and lived a long life.
3,"They predict that ^by 2100, technology will be..."
4,The painting was &created in 1821 and restored...
...,...
9995,No specific year is mentioned here.
9996,An ancient artifac**t from 1776 was discovered.
9997,The year 2200 seems like a distant future.
9998,"In 1800, the world was a different place."


Solution 1 result:


Unnamed: 0,Text,result
0,#The event took $place in 1999 and was remarka...,"[#, , , , $, , , , , , .]"
1,"In the year 2021, something significant happened.","[ , , , ,, , , , .]"
2,He was born in 1856 and lived a long life.,"[ , , , , , , , , , .]"
3,"They predict that ^by 2100, technology will be...","[ , , , ^, , ,, , , , , , .]"
4,The painting was &created in 1821 and restored...,"[ , , , &, , , , , , , .]"
...,...,...
9995,No specific year is mentioned here.,"[ , , , , , .]"
9996,An ancient artifac**t from 1776 was discovered.,"[ , , *, *, , , , , .]"
9997,The year 2200 seems like a distant future.,"[ , , , , , , , .]"
9998,"In 1800, the world was a different place.","[ , ,, , , , , , , .]"



Solution 2 result:


Unnamed: 0,Text,result
0,#The event took $place in 1999 and was remarka...,"[#, , , , $, , , , , , .]"
1,"In the year 2021, something significant happened.","[ , , , ,, , , , .]"
2,He was born in 1856 and lived a long life.,"[ , , , , , , , , , .]"
3,"They predict that ^by 2100, technology will be...","[ , , , ^, , ,, , , , , , .]"
4,The painting was &created in 1821 and restored...,"[ , , , &, , , , , , , .]"
...,...,...
9995,No specific year is mentioned here.,"[ , , , , , .]"
9996,An ancient artifac**t from 1776 was discovered.,"[ , , *, *, , , , , .]"
9997,The year 2200 seems like a distant future.,"[ , , , , , , , .]"
9998,"In 1800, the world was a different place.","[ , ,, , , , , , , .]"


Solution 1 Time: 0.026970 seconds
Solution 2 Time: 0.035796 seconds


##### 31. Write a Pandas program to extract only punctuations from the specified column of a given DataFrame.

In [25]:
count = 1000
data = {
    'Text': [
        '#The event took $place in 1999 and was remarkable.',
        'In the year 2021, something significant happened.',
        'He was born in 1856 and lived a long life.',
        'They predict that ^by 2100, technology will be far advanced.',
        'The painting was &created in 1821 and restored in 2015.',
        'No specific year is mentioned here.',
        'An ancient artifac**t from 1776 was discovered.',
        'The year 2200 seems like a distant future.',
        'In 1800, the world was a different place.',
        'Will 2199 be a year of change?'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'[^\w\s]'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    def exctract_values(text):
        return re.findall(regex, text)
    df['result'] = df['Text'].apply(exctract_values)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,#The event took $place in 1999 and was remarka...
1,"In the year 2021, something significant happened."
2,He was born in 1856 and lived a long life.
3,"They predict that ^by 2100, technology will be..."
4,The painting was &created in 1821 and restored...
...,...
9995,No specific year is mentioned here.
9996,An ancient artifac**t from 1776 was discovered.
9997,The year 2200 seems like a distant future.
9998,"In 1800, the world was a different place."


Solution 1 result:


Unnamed: 0,Text,result
0,#The event took $place in 1999 and was remarka...,"[#, $, .]"
1,"In the year 2021, something significant happened.","[,, .]"
2,He was born in 1856 and lived a long life.,[.]
3,"They predict that ^by 2100, technology will be...","[^, ,, .]"
4,The painting was &created in 1821 and restored...,"[&, .]"
...,...,...
9995,No specific year is mentioned here.,[.]
9996,An ancient artifac**t from 1776 was discovered.,"[*, *, .]"
9997,The year 2200 seems like a distant future.,[.]
9998,"In 1800, the world was a different place.","[,, .]"



Solution 2 result:


Unnamed: 0,Text,result
0,#The event took $place in 1999 and was remarka...,"[#, $, .]"
1,"In the year 2021, something significant happened.","[,, .]"
2,He was born in 1856 and lived a long life.,[.]
3,"They predict that ^by 2100, technology will be...","[^, ,, .]"
4,The painting was &created in 1821 and restored...,"[&, .]"
...,...,...
9995,No specific year is mentioned here.,[.]
9996,An ancient artifac**t from 1776 was discovered.,"[*, *, .]"
9997,The year 2200 seems like a distant future.,[.]
9998,"In 1800, the world was a different place.","[,, .]"


Solution 1 Time: 0.014722 seconds
Solution 2 Time: 0.024599 seconds


##### 32. Write a Pandas program to remove repetitive characters from the specified column of a given DataFrame.

In [32]:
count = 1000
data = {
    'Text': [
        'aaaaaaa text',
        'other bbbb text',
        'and ccc test text'
    ] * count
}

df = pd.DataFrame(data)

display(df)

def solution_1(df):
    df['result'] = df['Text'].str.replace(r'(\w)\1+', r'\1', regex=True)
    return df

def solution_2(df):
    def rep_char(str1):
        tchr = str1.group(0)
        if len(tchr) > 1:
            return tchr[0:1]
    def unique_char(rep, sent_text):
        convert = re.sub(r'(\w)\1+', rep, sent_text) 
        return convert
    df['result'] = df['Text'].apply(lambda x : unique_char(rep_char,x))
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,aaaaaaa text
1,other bbbb text
2,and ccc test text
3,aaaaaaa text
4,other bbbb text
...,...
2995,other bbbb text
2996,and ccc test text
2997,aaaaaaa text
2998,other bbbb text


Solution 1 result:


Unnamed: 0,Text,result
0,aaaaaaa text,a text
1,other bbbb text,other b text
2,and ccc test text,and c test text
3,aaaaaaa text,a text
4,other bbbb text,other b text
...,...,...
2995,other bbbb text,other b text
2996,and ccc test text,and c test text
2997,aaaaaaa text,a text
2998,other bbbb text,other b text



Solution 2 result:


Unnamed: 0,Text,result
0,aaaaaaa text,a text
1,other bbbb text,other b text
2,and ccc test text,and c test text
3,aaaaaaa text,a text
4,other bbbb text,other b text
...,...,...
2995,other bbbb text,other b text
2996,and ccc test text,and c test text
2997,aaaaaaa text,a text
2998,other bbbb text,other b text


Solution 1 Time: 0.011478 seconds
Solution 2 Time: 0.012149 seconds


##### 33. Write a Pandas program to extract numbers greater than 940 from the specified column of a given DataFrame.

In [3]:
import re
count = 1000
data = {
    'Text': [
        '#The event took $place in 999 and was remarkable.',
        'In the year 2021, something significant happened.',
        'He was born in 956 and lived a long life.',
        'They predict that ^by 100, technology will be far advanced.',
        'The painting was &created in 821 and restored in 2015.',
        'No specific year is mentioned here.',
        'An ancient artifac**t from 1776 was discovered.',
        'The year 2200 seems like a distant future.',
        'In 1800, the world was a different place.',
        'Will 2199 be a year of change?'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'9[4][1-9]|9[5-9]\d|[1-9]\d{3,}'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    def exctract_values(text):
        return re.findall(regex, text)
    df['result'] = df['Text'].apply(exctract_values)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,#The event took $place in 999 and was remarkable.
1,"In the year 2021, something significant happened."
2,He was born in 956 and lived a long life.
3,"They predict that ^by 100, technology will be ..."
4,The painting was &created in 821 and restored ...
...,...
9995,No specific year is mentioned here.
9996,An ancient artifac**t from 1776 was discovered.
9997,The year 2200 seems like a distant future.
9998,"In 1800, the world was a different place."


Solution 1 result:


Unnamed: 0,Text,result
0,#The event took $place in 999 and was remarkable.,[999]
1,"In the year 2021, something significant happened.",[2021]
2,He was born in 956 and lived a long life.,[956]
3,"They predict that ^by 100, technology will be ...",[]
4,The painting was &created in 821 and restored ...,[2015]
...,...,...
9995,No specific year is mentioned here.,[]
9996,An ancient artifac**t from 1776 was discovered.,[1776]
9997,The year 2200 seems like a distant future.,[2200]
9998,"In 1800, the world was a different place.",[1800]



Solution 2 result:


Unnamed: 0,Text,result
0,#The event took $place in 999 and was remarkable.,[999]
1,"In the year 2021, something significant happened.",[2021]
2,He was born in 956 and lived a long life.,[956]
3,"They predict that ^by 100, technology will be ...",[]
4,The painting was &created in 821 and restored ...,[2015]
...,...,...
9995,No specific year is mentioned here.,[]
9996,An ancient artifac**t from 1776 was discovered.,[1776]
9997,The year 2200 seems like a distant future.,[2200]
9998,"In 1800, the world was a different place.",[1800]


Solution 1 Time: 0.016951 seconds
Solution 2 Time: 0.026642 seconds


##### 34. Write a Pandas program to extract numbers less than 100 from the specified column of a given DataFrame.

In [7]:
import re
count = 1000
data = {
    'Text': [
         'The temperature is 98 degrees today.',
        'There are 123 apples in the basket.',
        'He ran 42 miles in the marathon.',
        'The price of the item is $67.',
        'There are 10,000 stars visible tonight.',
        'I have 85 emails to read.',
        'She is 23 years old.',
        'The car costs 15000 dollars.',
        'We need 75 volunteers for the event.',
        'There are 8 planets in our solar system.'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'\b[1-9]*\d{1}\b'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    def exctract_values(text):
        return re.findall(regex, text)
    df['result'] = df['Text'].apply(exctract_values)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,The temperature is 98 degrees today.
1,There are 123 apples in the basket.
2,He ran 42 miles in the marathon.
3,The price of the item is $67.
4,"There are 10,000 stars visible tonight."
...,...
9995,I have 85 emails to read.
9996,She is 23 years old.
9997,The car costs 15000 dollars.
9998,We need 75 volunteers for the event.


Solution 1 result:


Unnamed: 0,Text,result
0,The temperature is 98 degrees today.,[98]
1,There are 123 apples in the basket.,[123]
2,He ran 42 miles in the marathon.,[42]
3,The price of the item is $67.,[67]
4,"There are 10,000 stars visible tonight.",[10]
...,...,...
9995,I have 85 emails to read.,[85]
9996,She is 23 years old.,[23]
9997,The car costs 15000 dollars.,[]
9998,We need 75 volunteers for the event.,[75]



Solution 2 result:


Unnamed: 0,Text,result
0,The temperature is 98 degrees today.,[98]
1,There are 123 apples in the basket.,[123]
2,He ran 42 miles in the marathon.,[42]
3,The price of the item is $67.,[67]
4,"There are 10,000 stars visible tonight.",[10]
...,...,...
9995,I have 85 emails to read.,[85]
9996,She is 23 years old.,[23]
9997,The car costs 15000 dollars.,[]
9998,We need 75 volunteers for the event.,[75]


Solution 1 Time: 0.021670 seconds
Solution 2 Time: 0.034863 seconds


##### 35. Write a Pandas program to check whether two given words present in a specified column of a given DataFrame.

In [16]:
import re
count = 1000
data = {
    'Text': [
        'hello world',
        'hello mister',
        'world peace hello'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'^(?=.*\bhello\b)(?=.*\bworld\b).*$'

def solution_1(df):
    df['result'] = df['Text'].str.contains(regex, regex=True)
    return df

def solution_2(df):    
    def exctract_values(text):
        return len(re.findall(regex, text)) != 0
    df['result'] = df['Text'].apply(exctract_values)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,hello world
1,hello mister
2,world peace hello
3,hello world
4,hello mister
...,...
2995,hello mister
2996,world peace hello
2997,hello world
2998,hello mister


Solution 1 result:


Unnamed: 0,Text,result
0,hello world,True
1,hello mister,False
2,world peace hello,True
3,hello world,True
4,hello mister,False
...,...,...
2995,hello mister,False
2996,world peace hello,True
2997,hello world,True
2998,hello mister,False



Solution 2 result:


Unnamed: 0,Text,result
0,hello world,True
1,hello mister,False
2,world peace hello,True
3,hello world,True
4,hello mister,False
...,...,...
2995,hello mister,False
2996,world peace hello,True
2997,hello world,True
2998,hello mister,False


Solution 1 Time: 0.006792 seconds
Solution 2 Time: 0.009178 seconds


##### 36. Write a Pandas program to extract date (format: mm-dd-yyyy) from a given column of a given DataFrame.

In [23]:
import re
count = 1000
data = {
    'Text': [
        'The event is on 08-15-2023.',
        'My birthday is on 12-25-1990.',
        'We met on 03-05-2019 at the cafe.',
        'The deadline is 10-01-2022.',
        'Appointment scheduled for 01-10-2021.'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'\b[01]\d-[0-3]\d-\d{1,4}\b'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    def exctract_values(text):
        return re.findall(regex, text)
        
    df['result'] = df['Text'].apply(exctract_values)
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,The event is on 08-15-2023.
1,My birthday is on 12-25-1990.
2,We met on 03-05-2019 at the cafe.
3,The deadline is 10-01-2022.
4,Appointment scheduled for 01-10-2021.
...,...
4995,The event is on 08-15-2023.
4996,My birthday is on 12-25-1990.
4997,We met on 03-05-2019 at the cafe.
4998,The deadline is 10-01-2022.


Solution 1 result:


Unnamed: 0,Text,result
0,The event is on 08-15-2023.,[08-15-2023]
1,My birthday is on 12-25-1990.,[12-25-1990]
2,We met on 03-05-2019 at the cafe.,[03-05-2019]
3,The deadline is 10-01-2022.,[10-01-2022]
4,Appointment scheduled for 01-10-2021.,[01-10-2021]
...,...,...
4995,The event is on 08-15-2023.,[08-15-2023]
4996,My birthday is on 12-25-1990.,[12-25-1990]
4997,We met on 03-05-2019 at the cafe.,[03-05-2019]
4998,The deadline is 10-01-2022.,[10-01-2022]



Solution 2 result:


Unnamed: 0,Text,result
0,The event is on 08-15-2023.,[08-15-2023]
1,My birthday is on 12-25-1990.,[12-25-1990]
2,We met on 03-05-2019 at the cafe.,[03-05-2019]
3,The deadline is 10-01-2022.,[10-01-2022]
4,Appointment scheduled for 01-10-2021.,[01-10-2021]
...,...,...
4995,The event is on 08-15-2023.,[08-15-2023]
4996,My birthday is on 12-25-1990.,[12-25-1990]
4997,We met on 03-05-2019 at the cafe.,[03-05-2019]
4998,The deadline is 10-01-2022.,[10-01-2022]


Solution 1 Time: 0.009836 seconds
Solution 2 Time: 0.016626 seconds


##### 37. Write a Pandas program to extract only words from a given column of a given DataFrame.

In [27]:
import re
count = 2000000
data = {
    'Text': [
        'Hello world! 123',
        'Python is awesome.',
        '2024 is the year.',
        'I love programming @2024!',
        'Data Science & Machine Learning!'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'\b\w+\b'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):            
    df['result'] = df['Text'].apply(lambda text: re.findall(regex, text))
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,Hello world! 123
1,Python is awesome.
2,2024 is the year.
3,I love programming @2024!
4,Data Science & Machine Learning!
...,...
9999995,Hello world! 123
9999996,Python is awesome.
9999997,2024 is the year.
9999998,I love programming @2024!


Solution 1 result:


Unnamed: 0,Text,result
0,Hello world! 123,"[Hello, world, 123]"
1,Python is awesome.,"[Python, is, awesome]"
2,2024 is the year.,"[2024, is, the, year]"
3,I love programming @2024!,"[I, love, programming, 2024]"
4,Data Science & Machine Learning!,"[Data, Science, Machine, Learning]"
...,...,...
9999995,Hello world! 123,"[Hello, world, 123]"
9999996,Python is awesome.,"[Python, is, awesome]"
9999997,2024 is the year.,"[2024, is, the, year]"
9999998,I love programming @2024!,"[I, love, programming, 2024]"



Solution 2 result:


Unnamed: 0,Text,result
0,Hello world! 123,"[Hello, world, 123]"
1,Python is awesome.,"[Python, is, awesome]"
2,2024 is the year.,"[2024, is, the, year]"
3,I love programming @2024!,"[I, love, programming, 2024]"
4,Data Science & Machine Learning!,"[Data, Science, Machine, Learning]"
...,...,...
9999995,Hello world! 123,"[Hello, world, 123]"
9999996,Python is awesome.,"[Python, is, awesome]"
9999997,2024 is the year.,"[2024, is, the, year]"
9999998,I love programming @2024!,"[I, love, programming, 2024]"


Solution 1 Time: 21.818061 seconds
Solution 2 Time: 30.563356 seconds


##### 38. Write a Pandas program to extract the sentences where a specific word is present in a given column of a given DataFrame.

In [36]:
import re
count = 1000
data = {
    'Text': [
        'hello world',
        'hello mister',
        'world peace hello'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'(^.*(?=\bworld\b).*$)'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    df['result'] = df['Text'].apply(lambda text: re.findall(regex, text))
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,hello world
1,hello mister
2,world peace hello
3,hello world
4,hello mister
...,...
2995,hello mister
2996,world peace hello
2997,hello world
2998,hello mister


Solution 1 result:


Unnamed: 0,Text,result
0,hello world,[hello world]
1,hello mister,[]
2,world peace hello,[world peace hello]
3,hello world,[hello world]
4,hello mister,[]
...,...,...
2995,hello mister,[]
2996,world peace hello,[world peace hello]
2997,hello world,[hello world]
2998,hello mister,[]



Solution 2 result:


Unnamed: 0,Text,result
0,hello world,[hello world]
1,hello mister,[]
2,world peace hello,[world peace hello]
3,hello world,[hello world]
4,hello mister,[]
...,...,...
2995,hello mister,[]
2996,world peace hello,[world peace hello]
2997,hello world,[hello world]
2998,hello mister,[]


Solution 1 Time: 0.004722 seconds
Solution 2 Time: 0.009055 seconds


##### 39. Write a Pandas program to extract the unique sentences from a given column of a given DataFrame.

In [37]:
import re
count = 1000
data = {
    'Text': [
        'hello world',
        'hello mister',
        'world peace hello'
    ] * count
}

df = pd.DataFrame(data)

display(df)

def solution_1(df):
    return df['Text'].drop_duplicates()

# def solution_2(df):    
#     df['result'] = df['Text'].apply(lambda text: re.findall(regex, text))
#     return df

print('Solution 1 result:')
display(solution_1(df.copy()))
# print()
# print('Solution 2 result:')
# display(solution_2(df.copy()))


test_solutions(solution_1, df=df)

Unnamed: 0,Text
0,hello world
1,hello mister
2,world peace hello
3,hello world
4,hello mister
...,...
2995,hello mister
2996,world peace hello
2997,hello world
2998,hello mister


Solution 1 result:


0          hello world
1         hello mister
2    world peace hello
Name: Text, dtype: object

Solution 1 Time: 0.000443 seconds


In [42]:
import re
count = 10000
data = {
    'Text': [
        'hello world\nhello world',
        'hello mister',
        'world peace hello\nworld peace hello'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'(?sm)(^[^\r\n]+$)(?!.*^\1$)'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    df['result'] = df['Text'].apply(lambda text: re.findall(regex, text))
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,hello world\nhello world
1,hello mister
2,world peace hello\nworld peace hello
3,hello world\nhello world
4,hello mister
...,...
29995,hello mister
29996,world peace hello\nworld peace hello
29997,hello world\nhello world
29998,hello mister


Solution 1 result:


Unnamed: 0,Text,result
0,hello world\nhello world,[hello world]
1,hello mister,[hello mister]
2,world peace hello\nworld peace hello,[world peace hello]
3,hello world\nhello world,[hello world]
4,hello mister,[hello mister]
...,...,...
29995,hello mister,[hello mister]
29996,world peace hello\nworld peace hello,[world peace hello]
29997,hello world\nhello world,[hello world]
29998,hello mister,[hello mister]



Solution 2 result:


Unnamed: 0,Text,result
0,hello world\nhello world,[hello world]
1,hello mister,[hello mister]
2,world peace hello\nworld peace hello,[world peace hello]
3,hello world\nhello world,[hello world]
4,hello mister,[hello mister]
...,...,...
29995,hello mister,[hello mister]
29996,world peace hello\nworld peace hello,[world peace hello]
29997,hello world\nhello world,[hello world]
29998,hello mister,[hello mister]


Solution 1 Time: 0.040959 seconds
Solution 2 Time: 0.075497 seconds


##### 40. Write a Pandas program to extract words starting with capital words from a given column of a given DataFrame.

In [46]:
import re
count = 1
data = {
    'Text': [
        'hello World Hello world',
        'hello Mister',
        'world peace hello world peace hello',
        'one More word',
        'and Another Big word'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'\b[A-Z]\w*\b'

def solution_1(df):
    df['result'] = df['Text'].str.findall(regex)
    return df

def solution_2(df):    
    df['result'] = df['Text'].apply(lambda text: re.findall(regex, text))
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,hello World Hello world
1,hello Mister
2,world peace hello world peace hello
3,one More word
4,and Another Big word


Solution 1 result:


Unnamed: 0,Text,result
0,hello World Hello world,"[World, Hello]"
1,hello Mister,[Mister]
2,world peace hello world peace hello,[]
3,one More word,[More]
4,and Another Big word,"[Another, Big]"



Solution 2 result:


Unnamed: 0,Text,result
0,hello World Hello world,"[World, Hello]"
1,hello Mister,[Mister]
2,world peace hello world peace hello,[]
3,one More word,[More]
4,and Another Big word,"[Another, Big]"


Solution 1 Time: 0.000866 seconds
Solution 2 Time: 0.000774 seconds


##### 41. Write a Pandas program to remove the html tags within the specified column of a given DataFrame.

In [48]:
import re
count = 1
data = {
    'Text': [
        '<p>This is a <b>bold</b> paragraph.</p>',
        '<div><h1>Header</h1><p>Some content here.</p></div>',
        'No HTML tags here, just plain text.',
        '<a href="http://example.com">Example link</a>',
        '<span>Another <i>italic</i> text.</span>'
    ] * count
}

df = pd.DataFrame(data)

display(df)

regex = r'<.*?>'

def solution_1(df):
    df['result'] = df['Text'].str.replace(regex, '')
    return df

def solution_2(df):    
    df['result'] = df['Text'].apply(lambda text: re.sub(regex, '', text))
    return df

print('Solution 1 result:')
display(solution_1(df.copy()))
print()
print('Solution 2 result:')
display(solution_2(df.copy()))


test_solutions(solution_1, solution_2, df=df)

Unnamed: 0,Text
0,<p>This is a <b>bold</b> paragraph.</p>
1,<div><h1>Header</h1><p>Some content here.</p><...
2,"No HTML tags here, just plain text."
3,"<a href=""http://example.com"">Example link</a>"
4,<span>Another <i>italic</i> text.</span>


Solution 1 result:


Unnamed: 0,Text,result
0,<p>This is a <b>bold</b> paragraph.</p>,<p>This is a <b>bold</b> paragraph.</p>
1,<div><h1>Header</h1><p>Some content here.</p><...,<div><h1>Header</h1><p>Some content here.</p><...
2,"No HTML tags here, just plain text.","No HTML tags here, just plain text."
3,"<a href=""http://example.com"">Example link</a>","<a href=""http://example.com"">Example link</a>"
4,<span>Another <i>italic</i> text.</span>,<span>Another <i>italic</i> text.</span>



Solution 2 result:


Unnamed: 0,Text,result
0,<p>This is a <b>bold</b> paragraph.</p>,This is a bold paragraph.
1,<div><h1>Header</h1><p>Some content here.</p><...,HeaderSome content here.
2,"No HTML tags here, just plain text.","No HTML tags here, just plain text."
3,"<a href=""http://example.com"">Example link</a>",Example link
4,<span>Another <i>italic</i> text.</span>,Another italic text.


Solution 1 Time: 0.001032 seconds
Solution 2 Time: 0.001016 seconds
