# Week 11
# Data Transformations


## I. Remove Duplicates

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Small two-column frame whose final row repeats the row before it, so the
# de-duplication examples below have something to find.
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [3]:
# Flag every row that is an exact repeat of a row appearing earlier
data.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [4]:
# Return a copy without the repeated rows; the earliest occurrence is kept
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [5]:
# Keep only the first row seen for each distinct value in column k2
data.drop_duplicates(subset=['k2'])

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [6]:
# Keep only the first row seen for each distinct value in column k1
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


## II. Transform Data Using a Function or Mapping

In [7]:
# Food names deliberately mix upper and lower case ('bacon' / 'Bacon',
# 'Pastrami' / 'pastrami') to motivate the normalisation step that follows.
foods = ['bacon', 'pulled pork', 'bacon',
         'Pastrami', 'corned beef', 'Bacon',
         'pastrami', 'honey ham', 'nova lox']
ounces = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
data = pd.DataFrame({'food': foods, 'ounces': ounces})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [8]:
# Lookup table from a (lowercase) meat name to the animal it comes from
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [9]:
# The keys of meat_to_animal are all lowercase, while the 'food' column mixes
# cases ('Bacon', 'Pastrami'), so lowercase the column before the lookup.
lowercased = data['food'].str.lower()
# map() with a dict looks each element up by key; the result is stored as a
# new 'animal' column on the existing frame (data is modified in place).
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [10]:
# map() also accepts a callable, which is invoked once per element
data['food'].map(lambda food: meat_to_animal[food.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [11]:
# A lambda expression creates an anonymous (unnamed) function.
# apply() runs that function on every element of the Series.
data['food'].apply(lambda food: meat_to_animal[food.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [12]:
# A regular named function, defined here so it can be compared with the
# equivalent lambda expression used further down.
def square(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

square(2)

4

In [14]:
# One-column frame holding the integers 1 through 4
df = pd.DataFrame({'Col1': list(range(1, 5))})
df

Unnamed: 0,Col1
0,1
1,2
2,3
3,4


In [15]:
# Pass the named function itself (no parentheses); apply() calls it once per
# element of Col1 and the results become the new column Col2.
df['Col2'] = df['Col1'].apply(square)
df

Unnamed: 0,Col1,Col2
0,1,1
1,2,4
2,3,9
3,4,16


In [16]:
# Same computation as Col2, written inline as a lambda expression
df['Col3'] = df['Col1'].apply(lambda value: value ** 2)
df

Unnamed: 0,Col1,Col2,Col3
0,1,1,1
1,2,4,4
2,3,9,9
3,4,16,16


In [17]:
# A named function containing a branch; the next cell expresses the same
# logic as a one-line lambda with a conditional expression.
def abs_val(x):
    """Return ``x`` when it is non-negative, otherwise its negation."""
    if x >= 0:
        return x
    return -x

abs_val(-100)

100

In [18]:
# Conditional expression inside a lambda: `a if condition else b`
df['abs'] = df['Col1'].apply(lambda v: v if v >= 0 else -v)
df

Unnamed: 0,Col1,Col2,Col3,abs
0,1,1,1,1
1,2,4,4,2
2,3,9,9,3
3,4,16,16,4


## III. String Manipulation


### 1. String Methods

In [19]:
# split() breaks a string into a list of pieces at every delimiter
string = "a, b, c, d"
string.split(sep=',')

['a', ' b', ' c', ' d']

In [20]:
# split() leaves the space after each comma in place, so strip() every piece
string_pieces = string.split(',')
print(string_pieces)
string_pieces_cleaned = list(map(str.strip, string_pieces))
print(string_pieces_cleaned)

['a', ' b', ' c', ' d']
['a', 'b', 'c', 'd']


In [21]:
# The + operator joins strings end to end
space = " "
string = "I" + space + "like" + space + "pizza."
print(string)

I like pizza.


In [25]:
# join() is called on the delimiter and glues a list of strings together
names = ["Alex", "Brian", "Charlie", "Douglas"]
delimiter = "\n"
string = delimiter.join(names)
print(string)

Alex
Brian
Charlie
Douglas


In [29]:
# Three ways to look for a substring: the `in` operator, find() and index()
alphabet = "".join(chr(code) for code in range(ord("A"), ord("Z") + 1))
print("DEF" in alphabet)
print(alphabet.find("Alex"))  # find() returns -1 when the substring is absent
print(alphabet.index("DEF"))  # index() raises ValueError instead of returning -1
print(alphabet.find("abc"))   # the search is case-sensitive

True
-1
3
-1


In [30]:
# Slice from index 10 up to, but not including, index 20 (10 characters)
substring = alphabet[slice(10, 20)]
print(substring)

KLMNOPQRST


In [31]:
# count() returns the number of non-overlapping occurrences of a substring
print(alphabet.count("DEF"))
# `string` was last assigned in an earlier cell; the recorded output is 0,
# i.e. it contained no space characters when this cell ran.
print(string.count(" "))

1
0


In [32]:
# replace() returns a new string in which every occurrence of the first
# substring is swapped for the second; the original string is not modified.
print(string.replace("Alex", "Alexander"))

Alexander
Brian
Charlie
Douglas


In [33]:
# Replacing a substring with the empty string deletes it (here: every newline)
print(string.replace("\n", ""))

AlexBrianCharlieDouglas


### 2. Regular Expressions
**Regular expressions** provide a flexible way to search or match complex string patterns in text. Python's built-in `re` module is responsible for applying regular expressions to strings. Let's have a look at some examples.

In [37]:
import re
# Example 1: Split a string on runs of whitespace of varying length
string = "a  b    c    d \t e  \n  f   g"
print(string)
# string.split(' ') is not enough here: it splits on single spaces only, so it
# leaves empty strings behind and ignores the tab and the newline.
# The pattern is a raw string (r'...') so the backslash reaches the regex
# engine untouched; a plain '\s' is an invalid string escape that newer Python
# versions warn about.
pieces = re.split(r'\s+', string)  # \s is any whitespace character, + means one or more
print(pieces)

a  b    c    d 	 e  
  f   g
['a', 'b', 'c', 'd', 'e', 'f', 'g']


Useful `re` functions:
- findall()
- search()
- split()
- sub()

In [38]:
# findall() returns every non-overlapping match as a list of strings.
# Raw string: a plain '\s' is an invalid string escape in Python.
re.findall(r'\s+', string)

['  ', '    ', '    ', ' \t ', '  \n  ', '   ']

In [43]:
# search() scans the string and returns a match object for the first place
# the pattern occurs. Raw string: a plain '\s' is an invalid string escape.
match = re.search(r'\s+', string)
print("Substring:", match.group())  # the text that matched
print("Location:", match.span())    # (start, end) pair, end exclusive
print("Start:", match.start())
print("End:", match.end())

Substring:   
Location: (1, 3)
Start: 1
End: 3


In [44]:
# sub() replaces every match of the pattern with the given replacement text.
# Raw string: a plain '\s' is an invalid string escape in Python.
re.sub(r'\s+', ',', string)

'a,b,c,d,e,f,g'

**Construct a regular expression:**

[Reference](https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285)

1. Anchors
    - ^The: **Starts with** The
    - day\$: **Ends with** day
2. Quantifiers:
    - ab\s\*: ab followed by **zero or more** whitespaces
    - ab\s+: ab followed by **one or more** whitespaces
    - ab\s?: ab followed by **zero or one** whitespaces
    - ab\s{2}: ab followed by **exactly 2** whitespaces
    - ab\s{2,5}: ab followed by **2 - 5** whitespaces (no space inside the braces)
    - ab\s{2,}: ab followed by **2 or more** whitespaces
3. OR operator
    - a(b|c): a followed by **b or c**
    - a[bc]: same as above
4. Character classes
    - \d: a single digit
    - \w: a single word character (letter, digit, or underscore)
    - \s: a single whitespace
    - .: any character
    - \D: a single non-digit
    - \W: a single non-word character (not a letter, digit, or underscore)
    - \S: a single non-space
5. Bracket expression
    - [a-c]: a or b or c
    - [0-7]: a digit between 0 and 7
    - [^a-c]: any single character except a, b, or c
6. Greedy match
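    - <.+>: < followed by one or more of any character, then >, **expanding as far as possible** (the quantifiers \*, + and {} are greedy; append ? for a lazy match, e.g. <.+?>)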
7. Capturing:
    - a(bc): **capture** the group with value bc

In [50]:
# Example 1: Extract Social Security Number
string = "Is 123-45-6789 your SSN? Answer: No."
# Raw string so that \d reaches the regex engine unchanged (a plain "\d" is an
# invalid string escape). Each parenthesised part is a capturing group.
pattern = r"(\d{3})-(\d{2})-(\d{4})"
regex = re.compile(pattern)
# search() finds the pattern anywhere in the string, so no leading or
# trailing ".*" is needed (match() would only try at the very beginning).
match = regex.search(string)
# groups() holds the three captured digit runs; join them without the dashes
print("".join(match.groups()))

123456789


In [None]:
# Example 2: Extract phone numbers
string = "My phone number is: (347)123-4567"
# \( and \) match the literal parentheses around the area code; the three
# unescaped (...) pairs capture the area code, the prefix and the line number.
pattern = r"\((\d{3})\)(\d{3})-(\d{4})"
regex = re.compile(pattern)
# search() rather than match(): the number is not at the start of the string
match = regex.search(string)
print(match.groups())  # A tuple of strings: ('347', '123', '4567')

In [None]:
# Example 3: Validate email addresses
string1 = "Liang.Zhao1@lehman.edu"
string2 = "liang.Zhao1.cuny.edu"
# Raw string so the regex escapes (\. and \w) are not treated as string
# escapes. The pattern only contains lowercase ranges, so each candidate is
# lowercased before it is tested. ^ and $ force the whole string to match.
pattern = r"^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w+$"
regex = re.search(pattern, string1.lower())
print(regex)  # a match object: string1 has the expected shape
print(re.search(pattern, string2.lower()))  # None: string2 has no '@'