# TF-IDF + MNLI

https://wikidocs.net/127853

https://huggingface.co/roberta-large-mnli

In [None]:
%%capture
# Install the transformers package (its `pipeline` is imported in a later cell); %%capture hides pip's output.
!pip install transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; both dataset paths below live under that mount point.
drive.mount('/content/drive')

# Locations of the two CSV datasets ("good" and "bad") that a later cell reads.
good_dataset_path = "/content/drive/MyDrive/AI_Project_14/good_audio.csv"
bad_dataset_path = "/content/drive/MyDrive/AI_Project_14/bad_audio.csv"

Mounted at /content/drive


In [None]:
from gensim.parsing.preprocessing import preprocess_string
import csv

def read_csv_file(file_path):
    """Load every record of a CSV file into memory.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record, in file order (a header row,
        if the file has one, is returned like any other record). Each entry
        is a list of that record's fields as strings.
    """
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back intact.
    with open(file_path, 'r', newline='') as handle:
        return list(csv.reader(handle))

# Read both CSV files completely into lists of rows.
good_data = read_csv_file(good_dataset_path)
bad_data = read_csv_file(bad_dataset_path)

# Scripts after gensim preprocessing, each re-joined into one space-separated string.
pre_good_docs, pre_bad_docs = [], []
# The same scripts exactly as stored in the CSV (no preprocessing).
n_good_docs, n_bad_docs = [], []

# Rows 1..90 of each file are used (row 0 is skipped — presumably a header row;
# confirm against the CSVs). The script text is taken from column index 2.
# Both the preprocessed and the raw variant are always collected.
for row_no in range(1, 91):
    good_script = good_data[row_no][2]
    pre_good_docs.append(" ".join(preprocess_string(good_script)))
    bad_script = bad_data[row_no][2]
    pre_bad_docs.append(" ".join(preprocess_string(bad_script)))

    n_good_docs.append(good_script)
    n_bad_docs.append(bad_script)

# Combined corpora: all good documents first, followed by all bad documents.
pre_docs = pre_good_docs + pre_bad_docs
n_docs = n_good_docs + n_bad_docs

In [None]:
# Sanity check: print the first good script and the first bad script,
# each in its preprocessed form followed by its raw form.
first_samples = (pre_good_docs[0], n_good_docs[0], pre_bad_docs[0], n_bad_docs[0])
for sample in first_samples:
    print(sample)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer; no vocabulary is passed in, so it is learned from pre_docs.
vect = CountVectorizer()

# Fit on the preprocessed corpus and count terms, then convert the sparse
# result into a dense array (one row per document).
sparse_counts = vect.fit_transform(pre_docs)
dtm = sparse_counts.toarray()

# Feature names reported by the fitted vectorizer; used as column labels below.
vocab = vect.get_feature_names_out()

# Labelled view of the document-term matrix.
df = pd.DataFrame(data=dtm, columns=vocab)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer restricted to the terms already held in `vocab`.
tfidfv = TfidfVectorizer(vocabulary=vocab)

# Fit on the preprocessed corpus and compute the TF-IDF weights, then
# convert the sparse result into a dense array (one row per document).
tfidf_sparse = tfidfv.fit_transform(pre_docs)
tfidf = tfidf_sparse.toarray()

# Re-read the feature names from the fitted TF-IDF vectorizer.
vocab = tfidfv.get_feature_names_out()

# Labelled view of the TF-IDF matrix.
df = pd.DataFrame(data=tfidf, columns=vocab)

In [None]:
import numpy as np

# How many of the highest-weighted terms to keep per document.
n_top = 5

# Column indices of every row of the TF-IDF matrix, ordered from the largest
# weight to the smallest (ascending argsort, then reversed along the last axis).
tfidf_order = np.argsort(tfidf, axis=-1)[:, ::-1]

# For each document, translate its n_top leading column indices into terms.
topics = [[vocab[term_id] for term_id in ranked[:n_top]] for ranked in tfidf_order]

# One row per document, one column per rank (0 = highest TF-IDF weight).
df = pd.DataFrame(topics)
df

Unnamed: 0,0,1,2,3,4
0,bibl,adulter,stone,ritual,year
1,lakota,reserv,hill,indian,knee
2,ammonia,refriger,toxic,pressur,work
3,machin,want,start,stink,song
4,dodo,falcon,object,maltes,prop
...,...,...,...,...,...
175,humor,smack,lip,shootout,machet
176,jim,ian,tuesdai,carri,steven
177,valentin,blue,theatr,sort,relationship
178,vampir,kidnap,priest,girl,movi


In [None]:
# First three entries of row 0 of df.
df.iloc[0, :3]

0       bibl
1    adulter
2      stone
Name: 0, dtype: object

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline loaded with the roberta-large-mnli model.
classifier = pipeline(
    task='zero-shot-classification',
    model='roberta-large-mnli',
)

In [None]:
a = []  # 1-based position of each document in n_docs
b = []  # first score returned by the classifier for each document
c = []  # whitespace-delimited word count of each document
d = []  # label: 1 for the first 90 entries of n_docs, 0 for the remainder

# assumes df is the per-document top-term table, row-aligned with n_docs — TODO confirm
for doc_no, sequence_to_classify in enumerate(n_docs, start=1):
    # The first three entries of this document's df row are joined into ONE
    # space-separated string and passed as the candidate label argument.
    # NOTE(review): the string contains no commas, so the pipeline presumably
    # treats it as a single combined label — confirm that this is intended.
    top_terms = df.iloc[doc_no - 1][:3]
    candidate_labels = ' '.join(top_terms)
    result = classifier(sequence_to_classify, candidate_labels)
    score = result['scores'][0]

    word_count = len(sequence_to_classify.split())

    # Positions 1..90 are labelled 1, everything after that 0.
    label = 1 if doc_no <= 90 else 0

    a.append(doc_no)
    b.append(score)
    c.append(word_count)
    d.append(label)

In [None]:
def _sum_range(values, start, stop):
    """Add up values[start] ... values[stop - 1], left to right, starting from 0."""
    total = 0
    for position in range(start, stop):
        total += values[position]
    return total


# Median over all scores.
median_score = np.median(b)
print("md :", median_score)

# Score totals for positions 0..89 and 90..179 of b.
gsum = _sum_range(b, 0, 90)
bsum = _sum_range(b, 90, 180)

# Mean score of each group of 90.
print(gsum / 90)
print(bsum / 90)

md : 0.45228736102581024
0.4120017279146446
0.508214855276876


In [None]:
# Bundle the four parallel lists into one (idx, score, count, label) tuple per document.
data = list(zip(a, b, c, d))

# Destination of the CSV file.
file_path = '/content/drive/MyDrive/my_train_data.csv'

# Write the column names as the first row, followed by one row per document.
header = ['idx', 'score', 'count', 'label']
with open(file_path, 'w', newline='') as out_file:
    csv.writer(out_file).writerows([header, *data])