# Machine Learning

In [None]:
import pandas
import seaborn

# Read the lecture's UB data set from GitHub (needs network access) and show it.
df1 = pandas.read_csv(
    "https://raw.githubusercontent.com/YONESI-DBIS/DS_Lecture/main/UB_Data.csv"
)
print(df1)

# Summary statistics of Income for each Age value; reset_index() turns the
# Age group index back into an ordinary column.
df1_desc = df1.groupby('Age')['Income'].describe().reset_index()
print(df1_desc)

# seaborn's "tips" example data set.
df2 = seaborn.load_dataset('tips')
print(df2)

## 군집화(Clustering) k-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Features used for clustering
clustering_data = df2[['total_bill', 'tip']]

# Standardize every feature to mean 0 and standard deviation 1
sc = StandardScaler()
clustering_data_scaled = sc.fit_transform(clustering_data)

# Run k-means with 4 clusters. fit() returns the estimator itself, so
# `clusters` and `kmeans` are the same fitted object.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(clustering_data_scaled)
clusters = kmeans

print(clusters.labels_)

In [None]:
# Attach each row's cluster label as a new column.
# assign() returns a new DataFrame instead of writing into `clustering_data`
# in place. That frame was obtained by selecting columns from df2, and
# writing a column into such a selection makes pandas emit
# SettingWithCopyWarning.
clustering_data = clustering_data.assign(cluster_label=clusters.labels_)
print(clustering_data)

# Inspect the clustering result: number of rows per cluster
clustering_data.groupby('cluster_label').count()

In [None]:
# 클러스터링 결과 시각화

import seaborn

# Scatter plot of tip against total_bill, colored by cluster label.
# fit_reg=False turns off lmplot's regression line.
chart = seaborn.lmplot(
    data=clustering_data,
    x='total_bill',
    y='tip',
    hue='cluster_label',
    fit_reg=False,
)

## 주성분분석(PCA, Principal Component Analysis )

In [None]:
from sklearn.decomposition import PCA

# Four numeric columns of the UB data, standardized with the scaler object
# created earlier (fit_transform re-fits `sc` on this data).
clustering_ubdata = df1[['Age', 'Income', 'CCAvg', 'Mortgage']]
clustering_ubdata_scaled = sc.fit_transform(clustering_ubdata)

# Principal component analysis keeping 2 components
pca = PCA(n_components=2).fit(clustering_ubdata_scaled)

# Transform the data so that it contains only the principal components
clustering_ubdata_scaled_pca = pca.transform(clustering_ubdata_scaled)

# Standardized input first, projected output second
print(clustering_ubdata_scaled, clustering_ubdata_scaled_pca, sep="\n")


## 결정트리(Decision Tree)

In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Predictors: the CCAvg and Income columns as a NumPy array
feature = df1[['CCAvg', 'Income']].to_numpy()

# Target: the CDAccount column as a NumPy array
decision = df1['CDAccount'].to_numpy()

# Decision tree with default hyper-parameters, fitted on the two predictors
tree = DecisionTreeClassifier()
tree.fit(feature, decision)

In [None]:
import matplotlib.pyplot as plt

def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    """Fit ``model`` on ``(X, y)`` and draw its decision regions.

    The training points are drawn as a scatter plot, the model is fitted,
    and its predictions over a 200 x 200 grid spanning the resulting axis
    limits are drawn as translucent filled contours behind the points.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Fitted by this function before predicting on the grid.
    X : array indexed as ``X[:, 0]`` and ``X[:, 1]``
        The two features that are plotted.
    y : array of class labels
        Colors the points.  The contour levels are
        ``-0.5, 0.5, ..., n_classes - 0.5``, so region colors line up with
        the labels only when the labels are ``0 .. n_classes - 1``.
    ax : axes object, optional
        Axes to draw on; the current pyplot axes are used when omitted.
    cmap : str
        Colormap shared by the points and the regions.

    Returns
    -------
    None
    """
    # Test for None explicitly rather than relying on the truthiness of the
    # axes object.
    if ax is None:
        ax = plt.gca()

    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Fit the estimator, then predict a class for every grid point
    model.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # One filled band per class.  `clim` is intentionally not passed here:
    # contourf does not use it and warns about the unused keyword.
    n_classes = len(np.unique(y))
    ax.contourf(xx, yy, Z, alpha=0.3,
                levels=np.arange(n_classes + 1) - 0.5,
                cmap=cmap, zorder=1)

    # Restore the limits captured after drawing the points
    ax.set(xlim=xlim, ylim=ylim)


# Draw the decision regions of a fresh default decision tree on the
# CCAvg/Income features (visualize_classifier fits the model itself).
visualize_classifier(model=DecisionTreeClassifier(), X=feature, y=decision)

In [None]:
from sklearn.datasets import make_blobs

# Synthetic data: 300 samples generated around 4 centers
X, y = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=1.0,
    random_state=0,
)
# Trailing semicolon suppresses the notebook's echo of the return value
plt.scatter(x=X[:, 0], y=X[:, 1], s=50, c=y, cmap='rainbow');

In [None]:
# Default decision tree fitted on the synthetic blobs
tree = DecisionTreeClassifier()
tree.fit(X, y)

# The plot uses its own fresh classifier, which visualize_classifier fits
visualize_classifier(model=DecisionTreeClassifier(), X=X, y=y)

## 심층학습(Deep Learning)

MNIST 데이터셋을 사용한 handwriting recognition

https://sdc-james.gitbook.io/onebook/4.-and/5.1./5.1.3.-mnist-dataset

In [None]:
import tensorflow as tf

# Keras' MNIST data set module
mnist = tf.keras.datasets.mnist

# load_data() yields a (train, test) pair of (images, labels) pairs
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide the image arrays by 255
x_train = x_train / 255.0
x_test = x_test / 255.0

print(x_train)
print(y_train)

In [None]:
# Feed-forward network: flatten the 28x28 input, one hidden ReLU layer with
# dropout, and a 10-way softmax output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs on the training split
model.fit(x_train, y_train, epochs=5)

In [None]:
# Evaluate the compiled loss and metrics on the held-out test split
model.evaluate(x=x_test, y=y_test, verbose=2)

# Other Applications

## 정규 표현식 (Regular Expression)

- 문자 클래스 [ ] : 해당 문자중 한 개의 문자와 매치 ( - 를 사용하여 범위로 지정할 수도 있음)
    - ^는 not의 의미
    - [a-zA-Z]: 알파벳 모두, [0-9]: 숫자 모두
    - \d : 숫자
    - \D : not 숫자
    - \s : whitespace 문자
    - \S : not whitespace 문자
    - \w : 문자+숫자+밑줄(_)
    - \W : not (문자+숫자+밑줄)
- Dot(.): 줄바꿈 문자(\n)를 제외한 any 문자

- 반복(*): 바로 앞의 문자가 0번에서 무한대번 반복
- 반복(+): 바로 앞의 문자가 1번에서 무한대번 반복
- 반복({m,n}, ?)
    - {m}: 바로 앞의 문자가 m번 반복
    - {m,n}: 바로 앞의 문자가 m번에서 n번 반복
    - ?: 바로 앞의 문자가 있어도 되고 없어도 됨. 즉, {0,1}과 동일

In [None]:
import re

- match() : 문자열의 처음부터 정규식과 매치되는지 조사  -> 매치되면 match 객체 리턴 / 매치되지 않으면 None 리턴
- search() : 문자열 전체를 검색하여 정규식과 매치되는지 조사  -> 매치되면 match 객체 리턴 / 매치되지 않으면 None 리턴
- findall() : 정규식과 매치되는 모든 substring을 리스트로 리턴
- finditer() : 정규식과 매치되는 모든 substring을 반복 가능한 객체로 리턴 (반복 가능한 객체는 match 객체들을 반복적으로 리턴)
- sub(): 정규식과 매치되는 모든 substring을 다른 string으로 바꿈

In [None]:
import re

# Pattern: one or more lowercase ASCII letters
p = re.compile('[a-z]+')

# match() succeeds only when the pattern matches at the start of the string
m = p.match("python")
print(m)

# The string starts with a digit, so match() gives None ...
m = p.match("3 python")
print(m)

# ... while search() scans the whole string and finds "python"
m = p.search("3 python")
print(m)

p = re.compile('[a-z]+')

# match() returns None on failure, so test for that before using the result
m = p.match("python")
if m is None:
    print('No match')
else:
    print('Match found: ', m.group())

# Shorthand: the module-level function takes the pattern string directly
m = re.match('[a-z]+', "python")
print(m)

In [None]:
import re

# Pattern: one or more lowercase ASCII letters
p = re.compile('[a-z]+')

# Collect every non-overlapping match as a list of strings
result = [hit.group() for hit in p.finditer('life is too short')]
print(result)

# Uppercase letters are outside [a-z], so they split or shorten the matches
result = [hit.group() for hit in p.finditer('Life is too Short')]
print(result)

#p = re.compile('[a-zA-Z]+')
#result = p.findall('Life is too Short')
#print(result)

#text_data = 'life 13 is too 12 short!!!'
#text_modified = re.sub('[^a-zA-Z]+',' ',text_data)
#print (text_modified)

#text_data = '본 연구는 자카드 유사도(Jacard Similarity)가 다른 집합 유사도(Set Similarity)와 어떤 차이가 있는지를 보인다.'
#text_modified = re.sub("\(.+\)", '', text_data)
#print(text_data)
#print(text_modified)

## 간단 텍스트 마이닝(Simple Text Mining)

In [None]:
from collections import Counter

import pandas as pd
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

text_data = '''
In Gloucester, Massachusetts, Ruby Rossi is the only hearing member of her family: her parents Frank and Jackie and older brother Leo are all culturally deaf. She assists with the family fishing business and plans to join it full-time after finishing high school. Ruby auditions for the school choir, run by Bernardo Villalobos (or Mr. V), but when she is called upon to sing, she panics and runs away. She later returns to Mr. V and explains that she was bullied for talking funny as a child. Mr. V accepts Ruby into the choir after hearing her beautiful voice and encourages her to be more confident.
Mr. V pairs Ruby with Miles, a fellow student, for a duet at the upcoming choir recital. Their first performance goes poorly as they each prepare separately; Mr. V insists that they get together on their own to practice. Ruby invites Miles to her house to practice, but they are interrupted by Frank and Jackie loudly having sex in the next room over. Ruby later hears classmates in the cafeteria mocking the incident behind her back; Miles apologizes for spreading the story, but she wants nothing to do with him. She eventually forgives him and they resume their practice while kindling a relationship.
Meanwhile, Frank and Leo struggle to make ends meet with the fishing business as new fees and sanctions are imposed by the local board. At a board meeting, Frank stands and announces that he is starting his own company to get around the new restrictions and sell his fish on his own, inviting other local fishermen to join in. The family struggles to get the company off the ground, relying on Ruby to talk to people and spread the word.
Mr. V encourages Ruby to audition for Berklee College of Music and offers her private lessons to prepare. Ruby joins him for the lessons, but becomes increasingly busy helping her family with the business. Mr. V grows irritated with her constantly being late and making excuses, canceling their lessons. He chastises her for wasting his time and accuses her of not caring enough about music.
While fishing one day, Frank and Leo are intercepted by the Coast Guard after failing to respond to ship horns and radio calls. They are fined and have their fishing licenses revoked for their negligence; they appeal and manage to get their license back on the condition that they have a hearing person on board with them at all times. Ruby announces to the family that she is foregoing college and will join the business full-time. Her parents are supportive, but Leo reacts angrily, insisting that they can manage their own problems without Ruby's help.
Ruby's family attends her choir recital, and while they cannot hear her sing, they notice the positive reception from the audience around them. That night, Frank asks Ruby to sing a song for him while he feels her vocal cords, growing emotional. The entire family then drives to Boston with Ruby for her Berklee audition; they are not supposed to enter the audition hall, but they sneak up to the balcony to watch anyway. Ruby is nervous at first but gains confidence when she sees her family; she signs along with the song so they can understand what she is saying.
Some time later, Ruby is accepted to Berklee; she shares the news with her family and Mr. V, who are all excited for her, before asking Miles to visit her in Boston sometime. Meanwhile, the hearing workers in the family's fishing business have been learning sign language, allowing them to interpret for the family. Ruby's friend Gertie drives her to Boston for college as her family sees them off; Ruby signs "I love you" to them as they drive away.
'''

# Only needs to run once per environment #########
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): the resource names NLTK looks up may differ between NLTK
# releases; if tokenizing or tagging reports a missing resource, download
# the name given in that error message.
##############################

# English stop words plus character names that would otherwise dominate
# the counts. update() already covers 'ruby', so no separate add() is needed.
stopWords = set(stopwords.words("english"))
stopWords.update(['ruby', 'leo', 'frank'])
#print(stopWords)

# Created here but not used by the rest of this cell
lemma = WordNetLemmatizer()

# Replace every run of non-letter characters with a single space
onlyEnglish = re.sub(r"[^a-zA-Z]+", " ", text_data)
print(onlyEnglish)

print("===")

# Tokenize the lower-cased text once and reuse the tokens for POS tagging
english_word_token = word_tokenize(onlyEnglish.lower())
print(english_word_token)

tagged_english_word_token = pos_tag(english_word_token)
print(tagged_english_word_token)

# Optional step: keep only tokens tagged 'NN'. Uncomment to enable.
# only_noun_english_word_token = [
#     word for word, pos in tagged_english_word_token if pos == 'NN'
# ]
# print(only_noun_english_word_token)
# english_word_token = only_noun_english_word_token

# Drop the stop words
english_word_token_stop = [
    word for word in english_word_token if word not in stopWords
]

print(english_word_token_stop)


In [None]:
# Frequency of every remaining token
count_result = Counter(english_word_token_stop)
print(count_result)

# Of the 30 most frequent tokens, keep those longer than 3 characters
word_count = {}

for tag, counts in count_result.most_common(30):
    if len(str(tag)) <= 3:
        continue
    word_count[tag] = counts
    print("{} : {}".format(tag, counts))

In [None]:
# Bar chart of the word counts

plt.figure(figsize=(12,5))
plt.xlabel("word")
plt.ylabel("count")
plt.grid(True)

# Sort the (word, count) pairs once, highest count first, and derive both the
# labels and the bar heights from that single ordering so they cannot drift
# apart.
ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
sorted_Keys = [word for word, _ in ranked]
sorted_Values = [count for _, count in ranked]

positions = range(len(ranked))
plt.bar(positions, sorted_Values, align='center')
# rotation is documented as a number or 'vertical'/'horizontal'; the string
# '85' is not a valid value, so pass the angle as a number.
plt.xticks(positions, sorted_Keys, rotation=85)

plt.show()

In [None]:
# Word Cloud

# Keep wordcloud's built-in stop words under their own name. Binding them to
# `stopwords` would overwrite the nltk `stopwords` corpus imported earlier,
# and a later `stopwords.words(...)` call would then fail unless the import
# is run again.
cloud_stopwords = set(STOPWORDS)
word_cloud = WordCloud(background_color='ivory', stopwords=cloud_stopwords, width=800, height=600)
# The cloud is built directly from the word -> count mapping
cloud = word_cloud.generate_from_frequencies(word_count)

plt.figure(figsize=(8,8))
plt.imshow(cloud)
plt.axis('off')
plt.show()

## 웹크롤링(Web Crawling)

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.yonsei.ac.kr/sc/support/corona_notice.jsp?mode=view&article_no=198507&board_wrapper=%2Fsc%2Fsupport%2Fcorona_notice.jsp&pager.offset=0&board_no=752'

# Bound the wait so an unresponsive server cannot hang the cell
response = requests.get(url, timeout=10)

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    content = soup.select_one('#jwxe_main_content > div.jwxe_board > div > dl > dd > div.cont_area')
    # select_one() returns None when nothing matches the selector; report
    # that instead of failing on None.get_text()
    if content is None:
        print('No element matched the CSS selector; the page layout may have changed.')
    else:
        print(content.get_text())

else:
    print(response.status_code)


## 웹크롤링(Web Crawling) + 워드클라우드(Word Cloud)

In [None]:
import requests
from bs4 import BeautifulSoup

from collections import Counter

import pandas as pd
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS


url = 'https://www.imdb.com/title/tt10366460/plotsummary?ref_=tt_stry_pl'
#url = 'https://www.imdb.com/title/tt3480822/plotsummary?ref_=tt_stry_pl'
#url = 'https://www.imdb.com/title/tt9376612/plotsummary?ref_=tt_stry_pl'
#url = 'https://www.imdb.com/title/tt2294629/plotsummary?ref_=tt_stry_pl'

# Bound the wait so an unresponsive server cannot hang the cell
response = requests.get(url, timeout=10)

# Stop here when the download failed. Continuing would either raise a
# NameError on `content` or silently reuse a `content` left over from an
# earlier cell.
if response.status_code != 200:
    raise RuntimeError(
        "Request failed with HTTP status %d: %s" % (response.status_code, url)
    )

html = response.text
soup = BeautifulSoup(html, 'html.parser')

content = soup.select_one('#plot-synopsis-content')
# select_one() returns None when nothing matches the selector
if content is None:
    raise RuntimeError(
        "No element matched '#plot-synopsis-content'; "
        "the page layout may have changed."
    )

# English stop words plus character names that would otherwise dominate
# the counts
stopWords = set(stopwords.words("english"))
stopWords.update(['ruby', 'leo', 'frank'])
stopWords.update(['romanoff', 'vostokoff', 'belova', 'dreykov', 'wenwu', 'shang', 'anna', 'elsa', 'kristoff', 'duke', 'olaf'])

# Created here but not used by the rest of this cell
lemma = WordNetLemmatizer()

# Replace every run of non-letter characters with a single space
onlyEnglish = re.sub(r"[^a-zA-Z]+", " ", content.get_text())

print("===")

# Tokenize the lower-cased text once and reuse the tokens for POS tagging
english_word_token = word_tokenize(onlyEnglish.lower())
tagged_english_word_token = pos_tag(english_word_token)

# Keep only the tokens tagged 'NN'
only_noun_english_word_token = [
    word for word, pos in tagged_english_word_token if pos == 'NN'
]
english_word_token = only_noun_english_word_token

# Drop the stop words
english_word_token_stop = [
    word for word in english_word_token if word not in stopWords
]

count_result = Counter(english_word_token_stop)

# Of the 30 most frequent tokens, keep those longer than 3 characters
word_count = dict()

for tag, counts in count_result.most_common(30):
    if len(str(tag)) > 3:
        word_count[tag] = counts
        print("%s : %d" % (tag, counts))

# Use a separate name so the nltk `stopwords` corpus imported above is not
# overwritten by wordcloud's stop-word set
cloud_stopwords = set(STOPWORDS)
word_cloud = WordCloud(background_color='ivory', stopwords=cloud_stopwords, width=800, height=600)
cloud = word_cloud.generate_from_frequencies(word_count)

plt.figure(figsize=(8,8))
plt.imshow(cloud)
plt.axis('off')
plt.show()


