# 파이썬 웹 크롤링 & 텍스트 분석

## 1. Python Preliminaries
- urllib (urlopen)
- bs4 (BeautifulSoup)

### Basic Data Types
- ```Boolean```
- ```Integer```
- ```Float```
- ```String```

In [1]:
## Boolean
# The two truth values; comparing Booleans yields another Boolean.
x, y = True, False

# equality first, then inequality, one result per line
print(x == y, x != y, sep="\n")

False
True


In [2]:
## Numerical
# an int literal and a float literal
x, y = 4, 5.0

# each operand's type, then their sum (int + float gives a float)
print(type(x), type(y), x + y, sep="\n")

<class 'int'>
<class 'float'>
9.0


In [8]:
## String
x, y, z = "Hello", " ", "World"

# indexing & slicing: negative indices count from the end, and an
# omitted slice bound means "from the start" / "to the end"
for piece in (x[0], z[-1], z[2:], x[:2]):
    print(piece)

# string concatenation
print(x + y + z)

# formatting: fill the {} placeholder of one template with each name
greeting = "Hello {}!"
for language in ("Python", "Java"):
    print(greeting.format(language))

H
d
rld
He
Hello World
Hello Python!
Hello Java!


In [9]:
## Type casting
# float() converts the int 1 into the float 1.0; x is rebound to the result.
x = 1
print(x)
x = float(x)
print(x)

# int() parses the digit string "106" into the integer 106. Both prints show
# the same text because print() displays a string without its quotes.
y = "106"
print(y)
y = int(y)
print(y)

1
1.0
106
106


### Python Data Structures
- ```Tuple```
- ```List```
- ```Set```
- ```Dictionary```

In [10]:
## creating data structure
# The same three values stored in each built-in container type.
atuple = (1, 2, 3)    # tuple: parentheses
alist = [1, 2, 3]     # list: square brackets
aset = {1, 2, 3}      # set: braces around bare values
adict = {1: "John", 2: "Jane", 3: "Sandy"}    # dictionary: braces around key: value pairs

In [11]:
## in operator
# Membership tests against the tuple, list and set created earlier; the
# last one is False because 100 is not an element of aset.
print(1 in atuple, 2 in alist, 100 in aset, sep="\n")

True
True
False


In [12]:
## indexing & slicing
# the even numbers 2, 4, ..., 20
alist = list(range(2, 21, 2))

# first element, last element, everything from index 5 on, the first seven
print(alist[0], alist[-1], alist[5:], alist[:7], sep="\n")

2
20
[12, 14, 16, 18, 20]
[2, 4, 6, 8, 10, 12, 14]


In [14]:
## 2-D list

# three rows of two consecutive numbers each: [1, 2], [3, 4], [5, 6]
alist = [[first, first + 1] for first in (1, 3, 5)]

# a single index selects a whole row; a second index selects inside that row
last_row = alist[-1]
print(last_row)
top_left = alist[0][0]
print(top_left)

[5, 6]
1


### Conditionals and Loops
- ```if-elif-else```
- ```for x in y```
- ```while x```

In [15]:
# if-elif-else
# Exactly one branch runs: the first whose condition holds, otherwise else.
x = 5
if x == 0:
    print('x is zero')
elif x < 0:
    print('x is negative')
else:
    print('x is positive')

x is positive


In [16]:
# iterating with for loop
names = ['Johnny', 'Jane', 'Sam', 'Kim']
# the list has four elements, so the loop body runs four times,
# with name bound to each element in turn
for name in names:
    print(name)

Johnny
Jane
Sam
Kim


In [17]:
# using for loop and if statement together
# each tuple is unpacked into a name and its numeric identifier
ids = [('Johnny', 1), ('Jane', 2), ('Sam', 3), ('Kim', 4)]
for name, identifier in ids:
    # skip odd identifiers so that only names with even ones are printed
    if identifier % 2 != 0:
        continue
    print(name)

Jane
Kim


### Functions and Modules
- Functions
- Modules & Packages

In [18]:
# range() function
x = range(5)                  # the integers 0, 1, 2, 3, 4 (the stop value 5 is excluded)
for i in x:
    print(i)

0
1
2
3
4


In [19]:
# one argument: range(stop) counts from 0 up to, but not including, stop
for i in range(3):
    print(i)

0
1
2


In [20]:
# two arguments: range(start, stop) counts from start up to, but not including, stop
for i in range(1, 3):
    print(i)

1
2


In [21]:
# with three arguments: range(start, stop, step) advances by step each time,
# so this yields the odd numbers 1, 3, 5, 7, 9
for i in range(1, 10, 2):
    print(i)

1
3
5
7
9


In [22]:
# with negative step size: counts down from 5 and stops before reaching 1
for i in range(5, 1, -1):
    print(i)

5
4
3
2


In [23]:
# function with argument & return object
def my_function(language):
    """Return the sentence 'I like <language>' for the given language name.

    ``language`` is appended to the fixed prefix with ``+``, so it must be
    a string.
    """
    return 'I like ' + language

# By using arguments, we can enhance reusability of functions!!!
# The same function is called once per language; only the argument changes.
x, y, z = (my_function(language) for language in ('Python', 'Java', 'C++'))
print(x, y, z, sep='\n')

I like Python
I like Java
I like C++


### Modules & Packages
```python
import package_name [as alias]
import package_name.module_name
from package_name import module_name
from package_name import *

import module_name [as alias]
# note: "import module_name.function_name" is NOT valid (a plain import can only name a module or package); use the from-form below
from module_name import function_name
from module_name import *
```

In [24]:
# Four ways to reach math.exp; every variant prints the same value.
import math         # import the whole module; names are accessed as math.<name>
x = math.exp(10)
print(x)

import math as m     # same module bound to a shorter alias
x = m.exp(10)
print(x)

from math import exp  # import a single function from the module
x = exp(10)
print(x)

# Wildcard import, shown only to illustrate the syntax: it copies every
# public name of math into this namespace and can silently replace existing
# names (math.pow, for instance, shadows the built-in pow).
from math import *    # import all public names from the module
x = exp(10)
print(x)

22026.465794806718
22026.465794806718
22026.465794806718
22026.465794806718


In [27]:
## Using Counter class in collections package
from collections import Counter

l = [1, 1, 7, 7, 7, 4, 4, 4, 2, 1, 5, 5, 9, 11, 3, 'a', 'x', 9, 8, 'b', 'b', 'z', 'b']
counts = Counter(l)    # maps each distinct element to the number of times it occurs

# print every element that occurs more than twice
for key, occurrences in counts.items():
    if occurrences > 2:
        print(key)

1
7
4
b


### File Handling
- Reading text files
- Writing text files

In [None]:
## read()
# read() returns the entire file content as one string.
# The with-block closes the file when it ends, even if reading raises.
with open('text.txt', 'r', encoding='utf-8') as file:
    data = file.read()
print(type(data))

In [None]:
## readlines()
# readlines() returns a list with one string per line of the file.
# The with-block closes the file when it ends, even if reading raises.
with open('text.txt', 'r', encoding='utf-8') as file:
    data = file.readlines()
print(type(data))

In [None]:
## readline()
# readline() returns only the next line (here the first one) as a string.
# The with-block closes the file when it ends, even if reading raises.
with open('text.txt', 'r', encoding='utf-8') as file:
    data = file.readline()
print(type(data))

In [None]:
## one problem is that write() does not distinguish lines per se
# writing strings line by line (with '\n')
# write() adds no line break of its own, so each line carries an explicit
# '\n'; the last line is left without one. The with-block flushes and closes
# the file when it ends, even if one of the writes raises.
with open('new_file_3.txt', 'w', encoding='utf-8') as file:
    file.write('To be, or not to be - that is the question: \n')
    file.write("Whether 'tis nobler in the mind to suffer \n")
    file.write('The slings and arrows of outrageous fortune \n')
    file.write('Or to take arms against a sea of troubles \n')
    file.write('And by opposing end them')