In [3]:
import pandas as pd
import numpy as np
import torch
import sklearn
from sklearn.metrics import classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import matplotlib.pyplot as plt 
import seaborn as sn 
import csv

In [14]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from collections import Counter

In [5]:
categories = ['rec.sport.baseball', 'rec.sport.hockey']
news_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories, random_state=42)


train = pd.DataFrame({'text': news_train.data, 'topic': 'sports'})

In [6]:
train

Unnamed: 0,text,topic
0,\n\nThe tribe will be in town from April 16 to...,sports
1,This game would have been great as part of a d...,sports
2,"My god, hope we don't have to put up with this...",sports
3,\n\n\nI heard it will be the Minnesota-Detroit...,sports
4,I would like to make everyone aware that in wi...,sports
...,...,...
1192,\nQuestion:\nIf a team uses 40 players in a se...,sports
1193,Does anyone have the NHL STANDINGS for March 2...,sports
1194,The Dodgers have been shopping Harris to other...,sports
1195,\n\nI'm not quite sure how these numbers are g...,sports


Since both the book and the sports dataset are around 1000 entries long, we will also shorten the movie dataset to 1000. This is to keep all the classes represented equally.

In [7]:
path_movie = "C:/Users/jacqu/Desktop/Text_Mining_Project/movie_review.csv"
path_book = "C:/Users/jacqu/Desktop/Text_Mining_Project/customer reviews.csv"

# print("Path to dataset files:", path_movie)
# print("Path to dataset files:", path_book)

In [8]:
movie_dataset = pd.read_csv(path_movie, encoding="latin1")
book_dataset = pd.read_csv(path_book, header=0, names=['w', 'a', 's', 'd' , 'q', 'text', 'r', 'y', 'u', 'v'], encoding="latin1")
movie_dataset = movie_dataset[:1000]

In [9]:
book_dataset

Unnamed: 0,w,a,s,d,q,text,r,y,u,v
0,0,The Woman in Me,Unbelievably impressive. Her torn life on paper.,Murderess Marbie,4,I'm only a third way in. Shipped lightening fa...,True,26-10-2023,"Reviewed in the United States October 26, 2023",1668009048
1,1,The Woman in Me,What a heartbreaking story,L J,5,"""There have been so many times when I was scar...",True,06-11-2023,"Reviewed in the United States November 6, 2023",1668009048
2,2,The Woman in Me,Britney you are so invincible! You are an insp...,Jamie,5,The media could not be loaded. I personally ha...,True,01-11-2023,"Reviewed in the United States November 1, 2023",1668009048
3,3,The Woman in Me,"Fast Read, Sad Story",KMG,5,I have been a fan of Britney's music since the...,True,25-10-2023,"Reviewed in the United States October 25, 2023",1668009048
4,4,The Woman in Me,"Buy it, itâs worth the read!",Stephanie Brown,5,"Whether or not youâre a fan, itâs a great ...",True,01-11-2023,"Reviewed in the United States November 1, 2023",1668009048
...,...,...,...,...,...,...,...,...,...,...
915,915,The Wonky Donkey,The best child/grandmother book ever written,Marilyn Kreienkamp,5,This is exactly the kind of book children unde...,True,02-11-2023,"Reviewed in the United States November 2, 2023",545261244
916,916,The Wonky Donkey,Great book,Amazon Customer,5,We get this book for all our grandchildren the...,True,30-10-2023,"Reviewed in the United States October 30, 2023",545261244
917,917,The Wonky Donkey,"Fun book, makes you laugh",Jennifer Tinucci,5,I saw a grandma reading this book to her grand...,True,24-10-2023,"Reviewed in the United States October 24, 2023",545261244
918,918,The Wonky Donkey,Cutest beck ever!!,Janice Easter,5,I have ordered this book over and over to give...,True,19-10-2023,"Reviewed in the United States October 19, 2023",545261244


Now we need to add the movie and the book dataset text entries to the train set with their respective labels.

In [10]:
movie_append = pd.DataFrame({'text': movie_dataset.text , 'topic': 'movie'})
book_append = pd.DataFrame({'text': book_dataset.text, 'topic': 'book'})

In [11]:
training_data = pd.concat([train, movie_append, book_append], ignore_index=True, sort=False)

In [15]:
training_data

Unnamed: 0,text,topic
0,\n\nThe tribe will be in town from April 16 to...,sports
1,This game would have been great as part of a d...,sports
2,"My god, hope we don't have to put up with this...",sports
3,\n\n\nI heard it will be the Minnesota-Detroit...,sports
4,I would like to make everyone aware that in wi...,sports
...,...,...
3112,This is exactly the kind of book children unde...,book
3113,We get this book for all our grandchildren the...,book
3114,I saw a grandma reading this book to her grand...,book
3115,I have ordered this book over and over to give...,book


In [18]:
training_data.to_csv("my_training_data.csv", index=False)