In [None]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

import os
os.chdir('/content/drive/MyDrive/major_project_cmpt_733')

Mounted at /content/drive


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

wiki_links=['https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(0%E2%80%939_and_A)',
           'https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(B)',
           'https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(C%E2%80%93G)',
           'https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(H%E2%80%93P)',
           'https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(Q%E2%80%93Z)']


games = []
for url in wiki_links:
    web_url = requests.get(url).text
    soup = BeautifulSoup(web_url, 'lxml')

    table_class = "wikitable plainrowheaders sortable"
    table = soup.find('table', class_=table_class)

    # Initialize variables
    index = 0
    rows = table.select('tr')

    # Loop through rows
    while index < len(rows):
        row = rows[index]

        try:
            game_info = row.select('th')
            game_name = game_info[0].find(string=True)
            game_link = game_info[0].find(href=True)['href']
        except:
            pass

        else:
            cells = row.select('td')
            attributes = []
            i = 0
            working_len = len(cells) - 1
            while i < working_len:
                attribute = cells[i].find(string=True)
                attributes.append(attribute)
                i += 1

            # Insert to list    
            row_data = [game_name, game_link] + attributes
            games.append(row_data)

        # Increment row index
        index += 1


column_names = ['Title', 'Link', 'Genre', 'Developer', 'Publisher', 'Release_JP']

game_df = pd.DataFrame(games, columns=column_names).astype(str)

game_df = game_df.replace('\n', '', regex=True)
game_df = game_df.replace(':', '', regex=True)

print('Shape of dataframe: ', game_df.shape, '\n')
game_df.head(-5)


Shape of dataframe:  (1913, 6) 



Unnamed: 0,Title,Link,Genre,Developer,Publisher,Release_JP
0,1-2-Switch,/wiki/1-2-Switch,Party,Nintendo EPD,Nintendo,"March 3, 2017"
1,10 Second Ninja X,/wiki/10_Second_Ninja_X,Action platformer,Four Circle Interactive,Thalamus Digital,"July 30, 2021"
2,13 Sentinels Aegis Rim,/wiki/13_Sentinels_Aegis_Rim,Adventure,Vanillaware,Atlus,"April 12, 2022"
3,140,/wiki/140_(video_game),Action,Carlsen Games,Carlsen Games,"January 9, 2020"
4,1979 Revolution Black Friday,/wiki/1979_Revolution_Black_Friday,Adventure,Ink Stories,Ink Stories,"August 2, 2018"
...,...,...,...,...,...,...
1903,Zen Bound 2,/wiki/Zen_Bound_2,Puzzle,Secret Exit,Secret Exit,"May 24, 2018"
1904,Ziggurat,/wiki/Ziggurat_(2014_video_game),First-person shooter,Milkstone Studios,Milkstone Studios,"July 5, 2019"
1905,Zoids Wild Blast Unleashed,/wiki/Zoids_Wild_Blast_Unleashed,Action,Eighting,JP,"February 28, 2019"
1906,Zoids Wild Infinity Blast,/wiki/Zoids_Wild_Infinity_Blast,Action,Takara Tomy,Takara Tomy,"November 26, 2020"


In [None]:
def extract_plot(soup):
    hTwo = "h2"
    text = ''
    storyNameList = ["Game", "Plot"]
    hTag = 'h'
    pTag = 'p'
    for div in soup.find_all(hTwo):
        # Check the division header
        if div.text.startswith(storyNameList[0]) or div.text.startswith(storyNameList[1]):
            # Append the header of the division to the text
            text += div.text + '\n\n'
            # Loop through the entities
            for entity in div.next_siblings:
                if entity.name and entity.name.startswith(hTag):
                    # Stop if next division header is found
                    break
                elif entity.name == pTag:
                    # Add paragraph to text
                    text += entity.text + '\n'
        else:
            pass
    return text

wiki_url = 'https://en.wikipedia.org'

# Initialize variables
index = 0
game_plots = []

# Loop through rows
while index < len(game_df):
    # Get URL
    row = game_df.iloc[index]
    url = wiki_url + row['Link']
    
    # Get website content
    web_content = requests.get(url).text
    soup = BeautifulSoup(web_content, 'lxml')
    new_text = extract_plot(soup)
    # Check none condition
    if not new_text:
        game_plots.append(None)
    else:
        game_plots.append(new_text)
    
    # Increment index
    index += 1


In [None]:
for each_plot in game_plots:
  print(each_plot)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Gameplay[edit]

Love is a 2D platformer. The player character (known as fiveEight, as revealed in kuso) runs through several linear levels. They possess three abilities: jumping, moving left and right, and leaving a checkpoint behind at the push of a button. The player has 100 lives to play through 20 levels, in which the goal is to reach the end point in each to progress to the next.[1]

Love+ nearly completely revamped the original game. It added three new game modes: "Easy Mode", which gives the player unlimited lives, "YOLO Mode", in which the player has only one life, and "Speedrun Mode", in which the goal is to finish the game in the shortest possible time. The original mode was renamed to "Arcade Mode". Many of the levels in the original game were scrapped, being replaced by completely new levels, and those that were carried over were heavily altered. Overall, Love+ has fewer levels in its main campaign than Love, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
import re

def clean_game_plots(game_plots):
    """
    Clean the game plots by removing text within square brackets,
    replacing line breaks with spaces, and removing common prefix phrases.
    """
    cleaned_plots = []
    prefixes = ['Gameplay ', 'Game-play ', 'Plot ']
    for content in game_plots:
        if content is not None:
            # Remove text within square brackets
            content = re.sub(r'\[.*?\]+', '', content)
            # Replace line breaks with space
            content = content.replace('\n', ' ')
            # Remove common prefix phrases
            for prefix in prefixes:
                content = content.replace(prefix, '')
            cleaned_plots.append(content)
        else:
            cleaned_plots.append(None)
    return cleaned_plots

game_df['Plots'] = clean_game_plots(game_df['Plots'])
game_df

Unnamed: 0,Title,Link,Genre,Developer,Publisher,Released Date,Plots
0,1-2-Switch,/wiki/1-2-Switch,Party,Nintendo EPD,Nintendo,"March 3, 2017",1-2-Switch is a party game in which players d...
1,10 Second Ninja X,/wiki/10_Second_Ninja_X,Action platformer,Four Circle Interactive,Thalamus Digital,"July 30, 2021",10 Second Ninja X is a sidescrolling puzzle p...
2,13 Sentinels Aegis Rim,/wiki/13_Sentinels_Aegis_Rim,Adventure,Vanillaware,Atlus,"April 12, 2022",13 Sentinels: Aegis Rim is a video game where...
3,140,/wiki/140_(video_game),Action,Carlsen Games,Carlsen Games,"January 9, 2020","As described by Carlsen, 140 is ""an old schoo..."
4,1979 Revolution Black Friday,/wiki/1979_Revolution_Black_Friday,Adventure,Ink Stories,Ink Stories,"August 2, 2018",
...,...,...,...,...,...,...,...
1908,Zombie Army Trilogy,/wiki/Zombie_Army_Trilogy,Tactical shooter,Rebellion Developments,Rebellion Developments,"March 31, 2020",Zombie Army Trilogy utilises similar mechanic...
1909,Zombie Driver Immortal Edition,/wiki/Zombie_Driver_Immortal_Edition,Vehicular combat,Exor Studios,JP,"July 25, 2019",
1910,Zombieland Double Tap – Road Trip,/wiki/Zombieland_Double_Tap_%E2%80%93_Road_Trip,Twin-stick shooter,High Voltage Software,,"October 15, 2019",
1911,Zombies Ate My Neighbors,/wiki/Zombies_Ate_My_Neighbors,Run and gun,Dotemu,Lucasfilm Games,"June 29, 2021",The mad scientist Dr. Tongue has created a wi...


In [16]:
game_df = game_df[game_df.Title != 'Untitled']

# Rename columns
rename = {'Release_JP': 'Released Date'}
game_df.rename(columns=rename, inplace=True)        
                

In [17]:
game_df.head(-5)
game_df.columns

Index(['Title', 'Link', 'Genre', 'Developer', 'Publisher', 'Released Date',
       'Plots'],
      dtype='object')

In [18]:
game_df.to_csv('/content/drive/MyDrive/major_project_cmpt_733/wikipedia.csv')      