pandas 모듈에서는 dataframe을 선언하는 것이 매우 중요하다. 올바른 형태를 가진 dataframe을 가진 상태에서만 그 이후의 행동이 가능해진다. 
코딩을 하다보면, pandas에서 서로 다른 두 dataframe을 합쳐야 하는 경우를 매우 흔하게 볼 수 있다.
이 때 사용할 수 있는 command가 총 세 가지가 있다. 
1) concatenate - 두 개의 dataframe을 위아래로 이어붙인다고 생각하면 된다. 
2) join - join 은 두 개의 df를 양 옆으로 이어붙인다. 
3) merge - merge() 함수는 서로 다른 두 데이터프레임을 각 데이터에 존재하는 고유값을 기준으로 병합할 때 사용한다. 
pd.merge(df_left,df_right,how='inner', on = None)

In [6]:
# This is a notebook for the coding implementation.
import pandas as pd 
import numpy as np 

dict_a = {
    'id1': [1, 2, 3, 4, 5],
    'name': ['a', 'b', 'c', 'd', 'e'],
    'price': [10, 20, 30, 40, 50]
}

dict_b = {
    'id2': [1, 2, 3, 4],
    'name': ['a', 'b', 'z', 'z'],
    'price': [10, 20, 100, 100]
}

df_a = pd.DataFrame(dict_a)
df_b = pd.DataFrame(dict_b)

print(dict_a)
print(dict_b)

df_test = df_a.join(df_b,lsuffix ='l_',rsuffix ='r_')
print(df_test)


{'id1': [1, 2, 3, 4, 5], 'name': ['a', 'b', 'c', 'd', 'e'], 'price': [10, 20, 30, 40, 50]}
{'id2': [1, 2, 3, 4], 'name': ['a', 'b', 'z', 'z'], 'price': [10, 20, 100, 100]}
   id1 namel_  pricel_  id2 namer_  pricer_
0    1      a       10  1.0      a     10.0
1    2      b       20  2.0      b     20.0
2    3      c       30  3.0      z    100.0
3    4      d       40  4.0      z    100.0
4    5      e       50  NaN    NaN      NaN


In [7]:
# This cell is about pandas.apply and lambda. 
import numpy as np 
import pandas as pd

df = pd.DataFrame([[4,9],[1,4],[5,6]],columns = ['A','B'])
print(df)

# Let's say we want to apply certain function on the column A. 
def plus_one(x):
    x += 1
    return x

df['A'] = df['A'].apply(plus_one)
print(df)

# Motivation - how do we simplify this process?
# Initialize the dataframe
df = pd.DataFrame([[4,9],[1,4],[5,6]],columns = ['A','B'])
df = df.apply(lambda a : a+1 )
print(df)

   A  B
0  4  9
1  1  4
2  5  6
   A  B
0  5  9
1  2  4
2  6  6
   A   B
0  5  10
1  2   5
2  6   7


In [8]:
# This cell is about string.maketrans()
import string

obj = 'python'
before = 'ython'
after = 'conda'
sen = obj.maketrans(before,after)
print(obj.translate(sen)) # This function is substituting the string with another string.
# The prerequisite for this function is the before and after should have the same length.

pconda


In [9]:
# This cell is about re.sub() function. 
# Example of this function is as follows. 
import re 

text = 'I like abple and abple'
text_mod = re.sub('abple','apple',text)

print(text_mod) # So this function is replacing certain words with another words.

I like apple and apple


In [10]:
# This cell is about removing the repeated characters.
import re # Regular expression 
text = 'Publish or Periiish'
text_sub = re.sub(r'(.)1+',r'1',text) # +1 means that if the string is repeated more than 1 time, it is removed. 
#text_sub = re.sub(r'(.)1+',r'1',text)
print(text_sub)

Publish or Periiish


https://cosmosproject.tistory.com/180
The site above explains about the syntax of the re.sub() function.
https://www.nextree.co.kr/p4327/
This site contains nice reference for regular expression.

In [11]:
import nltk
from nltk.tokenize import RegexpTokenizer
# Regular expression is very useful for text processing.
# But, it will take some time to be familiar with re.
text = 'Publish or Perish!'
#tokenizer = RegexpTokenizer(r"w+", gaps = True)
tokenizer = RegexpTokenizer(r'\w+')
# For the second argument, we can specify the criteria for the tokenization.
print(tokenizer.tokenize(text))

['Publish', 'or', 'Perish']


In [12]:
# In this cell, I tried to observe the difference between stemming and lemmatizing.
import nltk
nltk.download('wordnet')
from nltk import WordNetLemmatizer
import os

stemmizer = nltk.PorterStemmer()
text = 'The greatest glory in living lies not in never falling, but in rising every time we fall.'
text = tokenizer.tokenize(text)

text_stem = [stemmizer.stem(word) for word in text]
# In order to use for loop, we need to put those in the list.
print((text_stem))
# The result is not 100% accurate.
# How about lemmatizer?
lemmatizer = nltk.WordNetLemmatizer()
text_lem = [lemmatizer.lemmatize(word) for word in text]
print(text_lem)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\every\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['the', 'greatest', 'glori', 'in', 'live', 'lie', 'not', 'in', 'never', 'fall', 'but', 'in', 'rise', 'everi', 'time', 'we', 'fall']
['The', 'greatest', 'glory', 'in', 'living', 'lie', 'not', 'in', 'never', 'falling', 'but', 'in', 'rising', 'every', 'time', 'we', 'fall']


In [13]:
# From this cell, I will practice the modules and the functions needed for vectorization. 
# Enumerate
courses = ['NERS441','NERS444','NERS442','NERS561','NERS544']
print(list(enumerate(courses)))
# The function enumerate will return the index and the element at the same time.
# return : [(0, 'NERS441'), (1, 'NERS444'), (2, 'NERS442'), (3, 'NERS561'), (4, 'NERS544')]
for n, course in enumerate(courses): # returns index first and then the element.
    print(f'course : {course}, number : {n}')

[(0, 'NERS441'), (1, 'NERS444'), (2, 'NERS442'), (3, 'NERS561'), (4, 'NERS544')]
course : NERS441, number : 0
course : NERS444, number : 1
course : NERS442, number : 2
course : NERS561, number : 3
course : NERS544, number : 4


In [14]:
# Integer encoding process is usually done based on the frequency of each word.
article = 'Nuclear Bombs save half the lives of Traditional Nuclear Bombs because their radioactive components have half the half life of full half life bombs. This is why science works.'
article = tokenizer.tokenize(article)
article = [stemmizer.stem(word) for word in article]
article = [lemmatizer.lemmatize(word) for word in article]
#print(article)

vocab = [] # Empty array for integer encoding
#vocab = list(enumerate(article))

#for n, name in enumerate(article): # n is the index, name is the word.
    #print(f'word : {name}, number : {n}')
    #frequency = 1
    #if name not in vocab: # New words
    #    vocab.append(name)
    
#print(vocab)

In [15]:
# Padding process 
# Zero padding
# The objective of padding is to make all the arrays to have same length.

In [16]:
# One hot encoding 
# One hot encoding is inefficient in terms of memory. 
# word2vec encoding - This reflects the similarity in the encoding process.
from sklearn.feature_extraction.text import TfidfVectorizer
text = ['The greatest glory in living lies not in never falling, but in rising every time we fall.',
        'Nuclear Bombs save half the lives of Traditional Nuclear Bombs because their radioactive components have half the half life of full half life bombs. This is why science works.']
vectorizer = TfidfVectorizer()
vectorizer.fit(text) # Input is a collection of documents.
print(type(text)) # Input should be list type.
# From 'fit' command, the machine is going to study what kind of vocabs are in the data.
vectorizer.vocabulary_ # Print the vocab dictionary
sorted(vectorizer.vocabulary_.items()) # This will sort the vocab dictionary.

<class 'list'>


[('because', 0),
 ('bombs', 1),
 ('but', 2),
 ('components', 3),
 ('every', 4),
 ('fall', 5),
 ('falling', 6),
 ('full', 7),
 ('glory', 8),
 ('greatest', 9),
 ('half', 10),
 ('have', 11),
 ('in', 12),
 ('is', 13),
 ('lies', 14),
 ('life', 15),
 ('lives', 16),
 ('living', 17),
 ('never', 18),
 ('not', 19),
 ('nuclear', 20),
 ('of', 21),
 ('radioactive', 22),
 ('rising', 23),
 ('save', 24),
 ('science', 25),
 ('the', 26),
 ('their', 27),
 ('this', 28),
 ('time', 29),
 ('traditional', 30),
 ('we', 31),
 ('why', 32),
 ('works', 33)]

In [17]:
# This cell is about the understanding of the vectorizer. 
import numpy as np 
import pandas as pd 
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

corpus_train = ["this is the first document",
"this document is the second document",
"And this is the third one.",
"Is this the fourth published document?", "publish or perish","publish or perish"]
# print(corpus_train)

vectorizer = CountVectorizer()
vectorizer.fit(corpus_train) # By this command, it will provide a dictionary of the vocabularies.

X = vectorizer.fit_transform(corpus_train) # Vectorizer will make the text into the vector
vectorizer.vocabulary_
print(vectorizer.get_feature_names_out(corpus_train))
X.toarray()
# From this array, we notice that the number of nonzero elements are the same as the number of the words in the sentence.
# the element 2 means that the term appeared 2 times in a single document.
# We noticed that the words are sorted based on the alphabet sequence.

['and' 'document' 'first' 'fourth' 'is' 'one' 'or' 'perish' 'publish'
 'published' 'second' 'the' 'third' 'this']


array([[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1],
       [1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0]], dtype=int64)

In [18]:
test_data = ["work hard in every waking hour, that is how to success. work to success!","Work 1,work 2,work 3"]
vectorizer.fit(test_data)
result = vectorizer.transform(test_data)
print(result.toarray())

Doc_Term_Matrix = pd.DataFrame(result.toarray(),columns = vectorizer.get_feature_names_out())
print(Doc_Term_Matrix) 
# Note that in the countvectorizer, it only counts the number of the term in a single sentence.
# By this simple experiment, I think the vectorizer is doing pre-processing on its own.

[[1 1 1 1 1 1 2 1 2 1 2]
 [0 0 0 0 0 0 0 0 0 0 3]]
   every  hard  hour  how  in  is  success  that  to  waking  work
0      1     1     1    1   1   1        2     1   2       1     2
1      0     0     0    0   0   0        0     0   0       0     3


In [22]:
# This cell is about Tfidf vectorizer
# This will convert text into tfidf feature.
from sklearn.feature_extraction.text import TfidfTransformer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]
vectorizer = TfidfVectorizer(min_df=0.1)
X = vectorizer.fit_transform(corpus)
print(X)
print(X.shape)
# Learn the vocaboularies, tf and idf 
# Returns a document term matrix.
vectorizer.get_feature_names_out()
Doc_Term_Matrix = pd.DataFrame(X.toarray(),columns = vectorizer.get_feature_names_out())
print(Doc_Term_Matrix) 
print(Doc_Term_Matrix.shape)
# This will be the tf of each term.
# Note that tf is computed as the frequency within a document.
vectorizer.idf_ # It will show you the inverse document frequency.

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483
(4, 9)
        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.0

array([1.91629073, 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.91629073, 1.        , 1.91629073, 1.        ])

In [23]:
# This cell will explain what hashing vectorizer is. 
from sklearn.feature_extraction.text import HashingVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]
# print(sample_text)
vectorizer = HashingVectorizer(n_features= 2**4)
Fit_text = vectorizer.fit_transform(corpus)
print(Fit_text) 
print(Fit_text.shape) # (4,16)
# (4,16) means that there are 4 documents, and 16 words.
# The text is vectorized.

  (0, 0)	-0.5773502691896258
  (0, 8)	-0.5773502691896258
  (0, 13)	0.5773502691896258
  (0, 14)	0.0
  (1, 0)	-0.8164965809277261
  (1, 11)	0.4082482904638631
  (1, 13)	0.4082482904638631
  (1, 14)	0.0
  (2, 4)	-0.7071067811865475
  (2, 5)	0.7071067811865475
  (2, 13)	0.0
  (2, 14)	0.0
  (3, 0)	-0.5773502691896258
  (3, 8)	-0.5773502691896258
  (3, 13)	0.5773502691896258
  (3, 14)	0.0
(4, 16)
