In [40]:
# Importing Modules
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from docx import Document
# Importing some required and important libararies for text processing of tweets in each data
import nltk
import re
import string
from collections import Counter
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer

import os
import pickle
import langchain
from transformers import BertTokenizer, BertModel
import torch
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import google_palm, GooglePalmEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

In [2]:
# Ensuring required nltk resources are downloaded
nltk.download('punkt')  # Tokenizer
nltk.download('wordnet')  # Lemmatizer
nltk.download('stopwords')  # Stopwords

[nltk_data] Downloading package punkt to C:\Users\Aditya
[nltk_data]     Shakya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Aditya
[nltk_data]     Shakya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aditya
[nltk_data]     Shakya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# source path
source = 'C:/Users/Aditya Shakya/OneDrive/Desktop/jtp articles/articles'

In [4]:
# Make a dataframe to store the name of each files
df = pd.DataFrame()

In [5]:
# Append the path of each files in the data
files = os.listdir(source)
file_path = []
for file in files:
    f_path = os.path.join(source, file)
    file_path.append(f_path)

In [6]:
# Path of the file
df['Path'] = file_path

In [7]:
# Title of the article
def article_title(f):
    document = Document(f)
    text = [para.text for para in document.paragraphs]
    return text[0]

# Full Article
def full_article(f):
    document = Document(f)
    text = [para.text for para in document.paragraphs]
    return ' '.join(text[1:])

# Word count of article
def word_count(article):
    return len(article.split())

# Sentence count in the article
def sentence_count(article):
    return len(article.split('.')) + len(article.split('?'))

# Caracter counts in the article
def char_count(article):
    return len(article)

In [8]:
df['Title'] = df['Path'].apply(article_title)

In [9]:
df['Article'] = df['Path'].apply(full_article)

In [10]:
df['Word counts'] = df['Article'].apply(word_count)
df['Sentence counts'] = df['Article'].apply(sentence_count)
df['Char counts'] = df['Article'].apply(char_count)

In [11]:
df.head()

Unnamed: 0,Path,Title,Article,Word counts,Sentence counts,Char counts
0,C:/Users/Aditya Shakya/OneDrive/Desktop/jtp ar...,20 Necessary Requirements of a Perfect Laptop ...,Data Science is field of computer science in w...,1393,74,8608
1,C:/Users/Aditya Shakya/OneDrive/Desktop/jtp ar...,5 Essential Tips to Improve the Readability of...,"It is, however, different from Software engine...",2846,142,16834
2,C:/Users/Aditya Shakya/OneDrive/Desktop/jtp ar...,5 Feature Selection Method from Scikit-Learn y...,Feature selection is an important part in impr...,2663,308,20931
3,C:/Users/Aditya Shakya/OneDrive/Desktop/jtp ar...,5 Frameworks for Reinforcement Learning on Python,Reinforcement learning is one of the fundament...,2224,120,14174
4,C:/Users/Aditya Shakya/OneDrive/Desktop/jtp ar...,5 Free Books to Learn Statistics for Data Science,"In addition to concepts in data science, one a...",726,30,4828


In [12]:
# Function for text preprocessing
def lower_text(text):
    # convert text to lower case
    text = text.lower()
    return text
    
# Remove stopwords
def remove_stopwords(text):
    stopwords_ = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stopwords_])
    return text

# Remove punctuation
def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Remove URLs
def remove_urls(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    return text

# Remove numbers
def remove_numbers(text):
    text = re.sub(r'\d+', '', text)
    return text

# Lemmatize text (convert words to their base form)
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text)])
    return text

# Remove extra whitespaces
def remove_extra_whitespaces(text):
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and leading/trailing spaces
    return text

# Combine all preprocessing steps into a single function
def preprocess_text(text):
    text = lower_text(text)
    text = remove_stopwords(text)
    text = remove_punctuation(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = lemmatize_text(text)
    text = remove_extra_whitespaces(text)
    return text



In [13]:
df['Article'] = df['Article'].apply(preprocess_text)

In [14]:
df_copy = df.copy()

In [15]:
df_copy['Word counts'] = df_copy['Article'].apply(word_count)
df_copy['Char counts'] = df_copy['Article'].apply(char_count)

In [16]:
df_copy = df_copy.drop(['Sentence counts', 'Path'], axis = 1)

In [17]:
df_copy.head()

Unnamed: 0,Title,Article,Word counts,Char counts
0,20 Necessary Requirements of a Perfect Laptop ...,data science field computer science math stati...,812,6025
1,5 Essential Tips to Improve the Readability of...,is however different software engineering most...,1504,10918
2,5 Feature Selection Method from Scikit-Learn y...,feature selection important part improving per...,1289,10937
3,5 Frameworks for Reinforcement Learning on Python,reinforcement learning one fundamental subfiel...,1266,10048
4,5 Free Books to Learn Statistics for Data Science,addition concept data science one also need kn...,439,3524


In [18]:
# Clean Title
df_copy['Title'] = df_copy['Title'].apply(preprocess_text)

In [19]:
df_copy.head()

Unnamed: 0,Title,Article,Word counts,Char counts
0,necessary requirement perfect laptop data scie...,data science field computer science math stati...,812,6025
1,essential tip improve readability python code,is however different software engineering most...,1504,10918
2,feature selection method scikitlearn know,feature selection important part improving per...,1289,10937
3,framework reinforcement learning python,reinforcement learning one fundamental subfiel...,1266,10048
4,free book learn statistic data science,addition concept data science one also need kn...,439,3524
