# Week 11
# Data Transformations


## I. Remove Duplicates

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Sample frame whose last row fully repeats the row before it,
# used by the de-duplication examples
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
data

In [None]:
# Identify duplicated rows: returns a boolean Series that is True for each
# row repeating an earlier row (by default the first occurrence is marked False)
data.duplicated()

In [None]:
# Drop duplicated rows; the result is a new DataFrame and `data` itself is
# left unchanged (inplace defaults to False)
data.drop_duplicates()

In [None]:
# Drop rows with a duplicated value in column k2 (only k2 is compared)
data.drop_duplicates(['k2'])

## II. Transform Data Using a Function or Mapping

In [None]:
# Food names (with inconsistent capitalization) and their weights in ounces
foods = ['bacon', 'pulled pork', 'bacon',
         'Pastrami', 'corned beef', 'Bacon',
         'pastrami', 'honey ham', 'nova lox']
weights = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
data = pd.DataFrame({'food': foods, 'ounces': weights})
data

In [None]:
# Lookup table from a (lowercase) meat name to the animal it comes from
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [None]:
# To make matching simpler, change strings to lowercase first: the keys of
# meat_to_animal are all lowercase, while some 'food' values are capitalized
lowercased = data['food'].str.lower()
# lowercased  (uncomment to inspect the intermediate Series)
# Look up every lowercased food in the dict and store the result as a new column
data['animal'] = lowercased.map(meat_to_animal)
data

In [None]:
# We can also pass a function: the lambda lowercases each value and looks it
# up in meat_to_animal (indexing with [] raises KeyError for an unknown food)
data['food'].map(lambda x: meat_to_animal[x.lower()])

In [None]:
# A regular named function that squares its argument; a lambda is a compact
# way to write the same kind of short function inline
def square(x):
    """Return x multiplied by itself."""
    product = x * x
    return product
square(2)

In [None]:
# Build a one-column frame, then add Col2 by applying the named function
# square() to every value of Col1
df = pd.DataFrame({'Col1': list(range(1, 5))})

squared = df['Col1'].apply(square)
df['Col2'] = squared
df

In [None]:
# Square Col1 again, this time with an inline lambda instead of a named function
df['Col3'] = df['Col1'].apply(lambda x: x ** 2)
df

## III. String Manipulation


### 1. String Methods

In [None]:
# Use split() to separate a string at each comma; the space that follows each
# comma stays attached to the next piece
string = "a, b, c, d"
string.split(',')

In [None]:
# split() is usually followed by strip() on every piece so that the
# whitespace surrounding each piece is removed
string_pieces = string.split(',')
print(string_pieces)
string_pieces_cleaned = list(map(str.strip, string_pieces))
print(string_pieces_cleaned)

In [None]:
# Use + to concatenate strings; separators such as " " must be added explicitly
string = "I" + " " + "like" + " " + "pizza."
print(string)

In [None]:
# Use join() to concatenate a list of strings with a delimiter: the string
# join() is called on (", " here) is placed between consecutive elements
names = ["Alex", "Brian", "Charlie", "Douglas"]
string = ", ".join(names)
print(string)

In [None]:
# Use index() and find() to detect a substring
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
print("DEF" in alphabet)  # the `in` operator gives a plain True/False answer
print(alphabet.find("Alex")) # find() will return -1 if the substring does not exist
print(alphabet.index("DEF"))  # index() returns the start position, but raises ValueError if the substring is missing
# print(alphabet.find("abc"))

In [None]:
# Extract the substring from alphabet starting at index 10 and stopping before
# index 20 (the end of a slice is exclusive, so indices 10 through 19 are returned)
substring = alphabet[10:20]
print(substring)

In [None]:
# count() returns the number of non-overlapping occurrences of a substring
print(alphabet.count("DEF"))
# NOTE(review): `string` is whatever an earlier cell last assigned — confirm which value is intended
print(string.count(" "))

In [None]:
# replace() returns a copy of the string with every occurrence of one
# substring replaced by another; the original string is not modified
print(string.replace("Alex", "Alexander"))

In [None]:
# replace() can also be used to delete a substring by replacing it with "":
print(string.replace(", ", ""))

### 2. Regular Expressions
**Regular expressions** provide a flexible way to search or match complex string patterns in text. Python's built-in `re` module is responsible for applying regular expressions to strings. Let's have a look at some examples.

In [None]:
import re
# Example 1: Split a string with a variable number of whitespace
string = "a  b    c    d \t e  \n  f   g"
print(string)
# string.split(' ') # This does not work
# The pattern is a raw string: '\s' in a normal literal is an invalid escape
# sequence that newer Python versions warn about.
pieces = re.split(r'\s+', string) # \s represents the whitespace character, + means one or more.
print(pieces)

Useful `re` functions:
- findall()
- search()
- split()
- sub()

In [None]:
# findall() returns a list with every non-overlapping match of the pattern;
# the raw string keeps \s intact for the regex engine
re.findall(r'\s+', string)

In [None]:
# search() returns a match object for the first place the pattern matches
# (or None when nothing matches); the raw string keeps \s intact for the regex engine
match = re.search(r'\s+', string)
print("Substring:", match.group())  # the matched text
print("Location:", match.span())    # (start, end) tuple
print("Start:", match.start())
print("End:", match.end())

In [None]:
# sub() replaces every run of whitespace with a single comma and returns the
# new string; the raw string keeps \s intact for the regex engine
re.sub(r'\s+', ',', string)

**Construct a regular expression:**

[Reference](https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285)

1. Anchors
    - ^The: **Starts with** The
    - day\$: **Ends with** day
2. Quantifiers:
    - ab\s\*: ab followed by **zero or more** whitespaces
    - ab\s+: ab followed by **one or more** whitespaces
    - ab\s?: ab followed by **zero or one** whitespaces
    - ab\s{2}: ab followed by **exactly 2** whitespaces
    - ab\s{2,5}: ab followed by **2 - 5** whitespaces (no space is allowed inside the braces)
    - ab\s{2,}: ab followed by **2 or more** whitespaces
3. OR operator
    - a(b|c): a followed by **b or c**
    - a[bc]: same as above
4. Character classes
    - \d: a single digit
    - \w: a single word character (letter, digit, or underscore)
    - \s: a single whitespace
    - .: any character
    - \D: a single non-digit
    - \W: a single non-word character (anything other than a letter, digit, or underscore)
    - \S: a single non-whitespace character
5. Bracket expression
    - [a-c]: a or b or c
    - [0-7]: a digit between 0 and 7
    - [^a-c]: any single character other than a, b, or c
6. Greedy match
    - \*, +, {}: these quantifiers are greedy, **expanding as far as possible** (e.g. `<.+>` matches from the first < to the last >)
7. Capturing:
    - a(bc): **capture** the group with value bc

In [None]:
# Example 1: Extract Social Security Number
string = "My SSN is: 123-45-6789"
# Raw string keeps \d intact for the regex engine. Each parenthesised group
# captures one part of the number; the leading .* lets match(), which only
# matches at the start of the string, skip the text before the digits.
pattern = r".*(\d{3})-(\d{2})-(\d{4})"
regex = re.compile(pattern)
match = regex.match(string)
print(match.groups())

In [None]:
# Example 2: Extract phone numbers
# NOTE(review): only the sample input is defined here; no pattern or match
# follows in this cell — presumably left as an exercise, confirm
string = "My phone number is: (347)123-4567"


In [None]:
# Example 3: Validate an email address
string1 = "Liang.Zhao1@lehman.edu"
string2 = "liang.Zhao1.cuny.edu"  # contains no "@"; not checked in this cell
# Raw string keeps \. and \w intact for the regex engine. The pattern only
# contains lowercase letter ranges, hence the .lower() on the input below.
pattern = r"^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w+$"
# Despite its name, `regex` holds the match object (or None if there is no match)
regex = re.search(pattern, string1.lower())
print(regex)