Loading the dataset


In [1]:
from google.colab import drive
import pandas as pd

# Make Google Drive visible to the Colab runtime under /content/drive
drive.mount('/content/drive')

# File path to your CSV file
file_path = '/content/drive/My Drive/enwiki-20170820.csv'

# Number of leading CSV rows to load (None makes read_csv read the whole file)
N_ROWS = 500

# Load only the first N_ROWS rows of the data
data = pd.read_csv(file_path, nrows=N_ROWS)

# Print the loaded data (pandas abbreviates the middle rows of a long frame)
print(data)


Mounted at /content/drive
     ARTICLE_ID                 TITLE                 SECTION_TITLE  \
0             0             Anarchism                  Introduction   
1             0             Anarchism     Etymology and terminology   
2             0             Anarchism                       History   
3             0             Anarchism  Anarchist schools of thought   
4             0             Anarchism   Internal issues and debates   
..          ...                   ...                           ...   
495          42                 Algae                External links   
496          43  Analysis of variance                  Introduction   
497          43  Analysis of variance                       History   
498          43  Analysis of variance            Motivating example   
499          43  Analysis of variance    Background and terminology   

                                          SECTION_TEXT  
0    \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1    \n

Preprocessing the dataset

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from textblob import TextBlob
# Fetch the NLTK resources this cell relies on, one after the other:
# tokenizer models, stopword lists and the WordNet database.
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)


# Keep only the 'ARTICLE_ID' and 'SECTION_TEXT' columns.
# .copy() makes the selection an independent frame, so adding new columns to
# it afterwards does not raise pandas' SettingWithCopyWarning.
data = data[['ARTICLE_ID', 'SECTION_TEXT']].copy()

# Function for basic text cleaning
def clean_text(text):
    """Strip HTML-style tags from ``text`` and lowercase the result.

    Args:
        text: The raw section text. Normally a ``str``; missing values
            (``None`` or float NaN, which is what pandas produces for an
            empty CSV cell) and other non-string values are also accepted.

    Returns:
        str: The text with every ``<...>`` span removed (shortest match,
        confined to a single line) and converted to lowercase. Missing
        values yield the empty string.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself
        if text is None or (isinstance(text, float) and text != text):
            return ''
        # Any other scalar (e.g. a number) is cleaned via its string form
        text = str(text)

    # Remove HTML tags
    without_tags = re.sub(r'<.*?>', '', text)
    # Convert text to lowercase
    return without_tags.lower()

# Build the derived text columns in sequence: first the cleaned text from the
# raw section text, then the token lists from the cleaned text.
for source_column, target_column, transform in (
    ('SECTION_TEXT', 'cleaned_text', clean_text),
    ('cleaned_text', 'tokens', word_tokenize),
):
    data[target_column] = data[source_column].apply(transform)

# English stopwords as a set, for fast membership tests
stop_words = set(stopwords.words('english'))

# Filter a token list against the module-level stop_words set
def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

# Shared PorterStemmer (stemming) and WordNetLemmatizer (lemmatization) instances
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Store each row's token list with the stopwords filtered out
data['tokens_without_stopwords'] = data['tokens'].apply(remove_stopwords)

# Stem a token list with the shared PorterStemmer instance
def stem_tokens(tokens):
    """Return a new list holding the stem of every token, in input order."""
    return list(map(stemmer.stem, tokens))

# Lemmatize a token list with the shared WordNetLemmatizer instance
def lemmatize_tokens(tokens):
    """Return a new list holding the lemma of every token, in input order."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Derive the stemmed and the lemmatized columns from the same stopword-free tokens
for output_column, normalise in (
    ('stemmed_tokens', stem_tokens),
    ('lemmatized_tokens', lemmatize_tokens),
):
    data[output_column] = data['tokens_without_stopwords'].apply(normalise)

# Print the lemmatized tokens beside the article id, then beside the raw text
for companion_column in ('ARTICLE_ID', 'SECTION_TEXT'):
    print(data[[companion_column, 'lemmatized_tokens']])




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


     ARTICLE_ID                                  lemmatized_tokens
0             0  ['', 'anarchism, '', ', political, philosophy,...
1             0  [term, ``, anarchism, '', compound, word, comp...
2             0  [===origins===, woodcut, digger, document, wil...
3             0  [portrait, philosopher, pierre-joseph, proudho...
4             0  [consistent, anarchist, value, controversial, ...
..          ...                                                ...
495          42  [*, –, database, algal, name, including, image...
496          43  ['', 'analysis, variance, '', ', (, ``, 'anova...
497          43  [analysis, variance, reached, fruition, 20th, ...
498          43  [fit.fair, fitvery, good, fitthe, analysis, va...
499          43  [anova, particular, form, statistical, hypothe...

[500 rows x 2 columns]
                                          SECTION_TEXT  \
0    \n\n\n\n\n\n'''Anarchism''' is a political phi...   
1    \n\nThe term ''anarchism'' is a compound word ...  

In [4]:
from google.colab import drive

# Mount Google Drive. The recorded output of this cell shows Drive was already
# mounted at this point, in which case the call just prints a notice.
drive.mount('/content/drive')

# File path for the new file (destination of the preprocessed table on Drive)
output_file_path = '/content/drive/My Drive/preprocessed_data.csv'

# Save the preprocessed data to a new CSV file; index=False leaves the row
# index out of the file.
# NOTE(review): the token columns hold Python lists, which a CSV can only keep
# in their string form -- presumably they must be parsed again when the file is
# read back; confirm against whatever consumes this file.
data.to_csv(output_file_path, index=False)

# Report where the file was written
print("Preprocessed data saved successfully to:", output_file_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Preprocessed data saved successfully to: /content/drive/My Drive/preprocessed_data.csv
