# Python Basics

### File Operations

Read file line by line

In [29]:
lines = []
# open the file in a `with` block so the handle is closed as soon as reading finishes
with open('building_global_community.txt') as infile:
    for line in infile:
        # remove whitespace (including the trailing line feed) at the beginning and end
        line = line.strip()
        # add processed line text into list 'lines'
        lines.append(line)

In [30]:
lines[0]

'To our community,'

Or, more concisely, with a list comprehension:

In [31]:
# list comprehension; the `with` block closes the file once the list has been built
with open('building_global_community.txt') as infile:
    lines = [line.strip() for line in infile]

In [32]:
lines[0]

'To our community,'

### String operations

In [10]:
sentence = "I want to eat an apple ."

#### string indexing

In [11]:
# indexing is zero-based, so index 5 is the sixth character
sentence[5]

't'

In [12]:
# slice [start:stop] includes index 10 and stops before index 13
sentence[10:13]

'eat'

In [13]:
# negative indices count from the end; -1 is the last character
sentence[-1]

'.'

In [14]:
# a negative stop also counts from the end: from index 10 up to (not including) the third-to-last character
sentence[10:-3]

'eat an appl'

#### find sequences in string

In [15]:
# find() returns the index of the first occurrence, searching from the left
sentence.find('a')

3

find from right-hand side

In [16]:
# rfind() returns the index of the last occurrence (still counted from the left)
sentence.rfind('a')

17

find with a starting point

In [17]:
# the second argument is the index at which the search starts, so the 'a' at index 3 is skipped
sentence.find('a', 4)

11

return -1 when not found

In [18]:
# find() signals "not found" with -1 instead of raising an exception
sentence.find('can')

-1

Combine slicing with `find` / `rfind` to extract a substring

In [19]:
# locate the slice boundaries first, then cut the substring between them
start = sentence.find('want to')   # index where 'want to' begins
stop = sentence.rfind('.')         # index of the last '.', excluded from the slice
sentence[start:stop]

'want to eat an apple '

### String Normalization

In [20]:
sentence

'I want to eat an apple .'

In [21]:
sentence.lower()

'i want to eat an apple .'

In [82]:
sentence.upper()

'I WANT TO EAT AN APPLE .'

In [83]:
# capitalize() upper-cases the first character and lower-cases the rest;
# this sentence already has that shape, so the result equals the original
sentence.capitalize()

'I want to eat an apple .'

In [86]:
'A'.isupper()

True

In [87]:
'A'.islower()

False

In [92]:
'apple'.isalpha()

True

In [93]:
# isdigit() is True only when the string is non-empty and every character is a digit
'20'.isdigit()

True

In [94]:
# '.' is not a digit, so a string holding a decimal fraction fails isdigit()
'20.9'.isdigit()

False

In [99]:
# isdecimal() is stricter than isdigit(): it rejects digit-like characters such as superscripts (e.g. '²')
'20'.isdecimal()

True

In [101]:
'furen5566'.isalnum()

True

### Split a sentence into words

In [78]:
# split() with no argument splits on any run of whitespace and never yields empty
# strings (split(' ') would, e.g. for two consecutive spaces);
# the result is the list of words in the sentence
sentence.split()

['I', 'want', 'to', 'eat', 'an', 'apple', '.']

In [102]:
sentence.endswith('.')

True

In [103]:
sentence.startswith('He wants')

False

## Dictionary examples

In [26]:
# create an empty dictionary; `book = dict()` is equivalent
book = {}

In [27]:
# assigning to a key that does not exist yet inserts it;
# values in one dictionary may have different types (str and int here)
book['title'] = 'Natural Language Processing with Python'
book['author'] = 'Bird, Klein, and Loper'
book['year'] = 2009

In [107]:
book

{'author': 'Bird, Klein, and Loper',
 'title': 'Natural Language Processing with Python',
 'year': 2009}

In [112]:
book.keys()

dict_keys(['year', 'title', 'author'])

In [113]:
book.values()

dict_values([2009, 'Natural Language Processing with Python', 'Bird, Klein, and Loper'])

In [114]:
book.items()

dict_items([('year', 2009), ('title', 'Natural Language Processing with Python'), ('author', 'Bird, Klein, and Loper')])

string formatting

In [28]:
'%s is a book written by %s in %d' % (book['title'], book['author'], book['year'])

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

In [108]:
'{0} is a book written by {1} in {2}'.format(book['title'], book['author'], book['year'])

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

In [109]:
# advanced formatting: `**book` unpacks the dictionary into keyword arguments,
# so each {name} placeholder is filled with the value stored under that key
'{title} is a book written by {author} in {year}'.format(**book)

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

## Counting Example

In [123]:
# sample data: 4 x 'red', 3 x 'yellow', 2 x 'blue', built by list repetition
data = ['red'] * 4 + ['yellow'] * 3 + ['blue'] * 2

In [124]:
# tally how many times each color occurs in `data` using a plain dictionary
counter = {}
for color in data:
    # get() falls back to 0 the first time a color is seen
    counter[color] = counter.get(color, 0) + 1

In [125]:
counter

{'blue': 2, 'red': 4, 'yellow': 3}

### use default dictionary

In [129]:
from collections import defaultdict
# A defaultdict calls its factory function to create the value for a missing key.
# `int()` returns 0, so every new key starts counting from 0.
# Equivalent alternative: defaultdict(lambda: 0)
counter = defaultdict(int)

In [130]:
# no membership check needed: the defaultdict supplies the initial value
# the first time a color is seen
for color in data:
    counter[color] += 1

In [131]:
counter

defaultdict(int, {'blue': 2, 'red': 4, 'yellow': 3})

### use built-in Counter

In [159]:
from collections import Counter

In [160]:
counter = Counter(data)

In [161]:
counter

Counter({'blue': 2, 'red': 4, 'yellow': 3})

In [162]:
new_data = ['blue', 'red', 'blue', 'yellow', 'blue', 'yellow', 'blue', 'yellow', 'blue']
# Counter.update() adds the new counts to the existing ones instead of replacing them
counter.update(new_data)

In [163]:
counter

Counter({'blue': 7, 'red': 5, 'yellow': 6})

#### most common elements

In [164]:
counter.most_common()

[('blue', 7), ('yellow', 6), ('red', 5)]

In [165]:
counter.most_common(2)

[('blue', 7), ('yellow', 6)]

In [166]:
# print every color with its count, most frequent first
for color, count in counter.most_common():
    print(color, count, sep=': ')

blue: 7
yellow: 6
red: 5


In [147]:
# clear counter: remove every entry
counter.clear()
# a Counter reports 0 for a missing key instead of raising KeyError
print(counter['blue'])

0


# Exercise

Compute the word frequencies in "building_global_community.txt":
- read sentences from the file "building_global_community.txt"
- split sentences into words (`split`, or nltk `word_tokenize`)
- filter out symbols (`isalpha`, `isdigit`, `isalnum`)
- normalize words before counting ('Word' and 'word' are considered the same word)
- count the occurrences of words (see the counting example above)

write your code here

In [6]:
# write your code here
# NOTE: the cells below call `wordCounter.most_common(...)`, so this cell must
# define `wordCounter` (e.g. a collections.Counter built from the normalized words)
...

In [7]:
wordCounter.most_common(20)

[('.', 261),
 (',', 256),
 ('to', 220),
 ('the', 181),
 ('and', 175),
 ('we', 161),
 ('of', 129),
 ('a', 127),
 ('our', 111),
 ('in', 89),
 ('is', 88),
 ('community', 80),
 ('that', 71),
 ('people', 62),
 ('for', 62),
 ('are', 55),
 ('this', 48),
 ('more', 46),
 ('can', 45),
 ('with', 44)]

### Save the result into a csv file

https://docs.python.org/3/library/csv.html

In [8]:
import csv

write word count result

In [9]:
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write (e.g. Windows) get a blank row after every record
with open('wordcount.csv', 'w', newline='') as csvfile:
    # set up header
    fieldnames = ['word', 'count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    # one row per word, most frequent first
    for word, count in wordCounter.most_common():
        writer.writerow({'word': word, 'count': count})

read csv

In [11]:
# newline='' is what the csv documentation recommends for files passed to a reader,
# so that newlines embedded in quoted fields are interpreted correctly
with open('wordcount.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # each row is a dict keyed by the column names from the file's header row;
    # all values are read back as strings
    for row in reader:
        print(row['word'], row['count'])

. 261
, 256
to 220
the 181
and 175
we 161
of 129
a 127
our 111
in 89
is 88
community 80
