<div class="alert alert-block alert-info">
    <h2 align="center">Trend Extraction from News</h2>
    <h3 align="center">Zarebin Search Engine - Entrance Project for Internship</h3>
    <h4 align="center"><a href="https://t.me/afsharino">Mohammad Afshari</a></h4>
</div>

<style>
.aligncenter {
    text-align: center;
}
</style>
<p class="aligncenter">
    <img src = "../images/zarebin.png"  height=400 width= 750>
</p>

# Import Libraries

In [1]:
# Scientific 
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Parse html
import base64
from bs4 import BeautifulSoup

# Concurrecy
import multiprocessing as mp
import asyncio

# Pre and postproccessing 
from hazm import Normalizer, WordTokenizer, Lemmatizer
from hazm.utils import stopwords_list
import re 

# data and time
import khayyam

# Others
import os
import glob
from tqdm.notebook import tqdm
from collections import Counter


# Look for Datasets

In [2]:
print(f'Current working directory: {os.getcwd()}')
path = '/home/afsharino/Desktop/Trend-extraction-from-news/datasets'
print(f'files in dataset directory are: {os.listdir(path)}')

Current working directory: /home/afsharino/Desktop/Trend-extraction-from-news/src
files in dataset directory are: ['asriran_all.csv', 'mehrnews_all.csv', 'khabaronline_all.csv', 'yjcnews_all.csv', 'tasnimnews_all.csv', 'farsnews_all.csv', 'varzesh3_all.csv', 'isna_all.csv', 'entekhab_all.csv', 'anapress_all.csv', 'shana_all.csv', 'jahannews_all.csv', 'mashreghnews_all.csv', '.~lock.entekhab_all.csv#', 'iscanews_all.csv']


# Load Datasets

In [8]:
class DataReader:
    def __init__(self, data=None):
        print('DataReader called...')
        self.data = data
    
    # This method reads all csv files in given path
    def read_dataset(self, path:str) -> None:
        print(f'Reading datasets...')
        
        try:
            data = dict()
            for file in tqdm(glob.glob(path)):
                # Get csv file's name
                file_name = file.split('/')[-1].split('_')[0]
        
                # Read csv file and save in dictionary
                data[file_name] = pd.read_csv(file,nrows=20)
                print(f'data with name {file_name} saved successfully :)')
            
            self.data = data
            print(f'Process of reading datasets completed successfully :)')
        except Exception as e:
            print(f'Unfortunately Process of reading datasets failed :(')
            print(f'Error Raised:{e}')
    
    # This method decodes the html source to browser format
    def decode_html_file(self, html_source:str) -> str:
        try:
            # Decode html source to utf-8
            html_doc = base64.b64decode(html_source).decode('utf-8')
            return html_doc
        
        except Exception as e:
            print(f'Error raised: Can not decode the file because: {e}')      
            
    # This method Parses the html doc to soup object
    def parse_html_doc(self, html_doc:str) -> BeautifulSoup:
        try:
            # Parse html to soup object
            soup = BeautifulSoup(html_doc, 'html.parser')
            return soup
        
        except Exception as e:
            print(f'Error raised: Can not parse the html doc because: {e}')
    
    # This method extracts title from the news
    def get_title(self, series:pd.core.series.Series) -> str:
        # Unpack series
        html_source, url = series[0], series[1]
        
        # Decode html source
        html_doc = self.decode_html_file(html_source)
        
        # Parse html to soup
        soup = self.parse_html_doc(html_doc)    

        try:
            # Get title of the news
            return soup.title.text

        except Exception as e:
            if soup == None:
                print('Soup is None!')
                print(f'URL= {url}')
                print('--------------------\n')
                return np.nan

            elif soup.title == None:
                print('Soup.title is None!')
                print(f'URL= {url}')
                print(soup.text)

                print('--------------------\n')
                return np.nan
            else:
                print(f'Error: {e}')
    
    # This method adds extracted titles to corrosponding row in dataframes
    def add_title_to_df(self, data):
        data['title'] =  data[['html', 'url']].apply(lambda x: self.get_title(x),axis=1)
        return data
    
    # This method does the proccess of add title for all dataframes
    def add_all_titles(self) -> None:
        print(f'Start adding titles to dataframes...')
        
        for source in tqdm(self.data.keys()):
            print(f'{source} in progress...\n')
            
            self.data[source] = self.parallelize(self.data[source])
            
            print(f'{source} titles added successfully :)')
            print(f'---------------------------------------------------------------------\n\n')
            
    # This method extracts dates from the news
    def get_date(self, series:pd.core.series.Series, source:str) -> str:
        # Unpack series
        html_source, url = series[0], series[1]

        # Decode html source
        html_doc = self.decode_html_file(html_source)
        
        # Parse html to soup
        soup = self.parse_html_doc(html_doc) 
        
        #------------------ asriiran ------------------
        if source == 'asriran':
            try:
                # Get date out of the source
                date = soup.find_all('div', {'class' : 'news_nav header_pdate'})[0].\
                text.split("                    ")[2].strip(" ")
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find_all('div', {'class' : 'news_nav header_pdate'}):
                    print('soup.find_all is empty!')
                    print(f'URL= {url}')
                    print(soup.text)
                    print('--------------------\n')
                    return np.nan
                else:
                    print(f'Error: {e}')
                    
        #------------------ mehrnews ------------------
        if source == 'mehrnews':
            try:
                # Get date out of the source
                date = soup.find('div', {'class': 'col-6 col-sm-4 item-date'}).\
                text.split('،')[0].strip('\n')
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('div', {'class': 'col-6 col-sm-4 item-date'}):
                    print('soup.find is empty!')
                    print(f'URL= {url}')
                    print(soup.text)
                    print('--------------------\n')
                    return np.nan
                else:
                    print(f'Error: {e}')
                    
        #------------------ khabaronline ------------------
        if source == 'khabaronline':
             try:
                # Get date out of the source
                date = soup.find('div', {'class': 'col-6 col-sm-4 item-date'}).\
                text.split('-')[0].strip('\n')
                return date

             except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('div', {'class': 'col-6 col-sm-4 item-date'}):
                    print('soup.find is empty!')
                    print(f'URL= {url}')
                    print(soup.text)
                    print('--------------------\n')
                    return np.nan
                else:
                    print(f'Error: {e}')
                    
        #------------------ yjcnews ------------------
        if source == 'yjcnews':
            try:
                # Get date out of the source
                date = soup.find_all('span', {'class': 'date-color-news'})[1].text.strip()
                return date
        
            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find_all('span', {'class': 'date-color-news'}):
                    print('soup.find_all is empty!')
                    print(f'URL= {url}')
                    print(soup.text)
                    print('--------------------\n')
                    return np.nan
                else:
                    print(f'Error: {e}')
                    
        #------------------ anapress ------------------
        if source == 'anapress':
            try:
                # Get date out of the source
                date = soup.find('div', {'class': 'news-date'}).text.split('-')[0].strip()
                return date
        
            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan
        
                elif not soup.find('div', {'class': 'news-date'}):
                    print('soup.find is empty!')
                    print(f'URL= {url}')
                    print(soup.text)
                    print('--------------------\n')
                    return np.nan
                else:
                    print(f'Error: {e}')
                    
        #------------------ tasnimnews ------------------
        if source == 'tasnimnews':
            try:
                # Get date out of the source
                date = soup.find('li', {'class': 'time'}).text.split('-')[0].strip()
                return date
        
            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('li', {'class': 'time'}):
                    if not soup.find('time'):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan
                    else:
                        date1 = soup.find('time').text.split('-')[0].strip()
                        return date1
                else:
                    print(f'Error: {e}')
                    
        #------------------ farsnews ------------------
        if source == 'farsnews':
            try:
                # Get date out of the source
                date = soup.find('time').text.split('\n')[2].strip(' \r')
                return date
        
            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('time'):
                    if not soup.find('div', {'class': 'data-box d-flex justify-content-start align-items-center'}):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan
                    else:
                        date1 = soup.find('div', {'class': 'data-box d-flex justify-content-start align-items-center'}).\
                        find_all('span')[2].text.strip(' \r\n')
                        return date1

                else:
                    print(f'Error: {e}')
    
        #------------------ varzesh3 ------------------
        if source == 'varzesh3':
            try:
                # Get date out of the source
                date = soup.find('div', {'class': 'news-info'}).find_all('span')[1].text.split('ساعت')[0].strip()
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('div', {'class': 'news-info'}):
                    if not soup.find('span', {'class': 'date'}):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan
                    else:
                        date1 = soup.find('span', {'class': 'date'}).text.split('ساعت')[0].strip(' -')
                        return date1

                else:
                    print(f'Error: {e}')

        #------------------ isna ------------------
        if source == 'isna':
            try:
                # Get date out of the source
                date = soup.find('span', {'class': 'text-meta'}).text.split('/')[0].strip()
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('span', {'class': 'text-meta'}):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan

                else:
                    print(f'Error: {e}')

        #------------------ entekhab ------------------
        if source == 'entekhab':
                try:
                    # Get date out of the source
                    date = soup.find('div', {'class': 'news_nav news_pdate_c col-xs-36 col-sm-18'}).\
                    text.split('-')[1].strip()
                    return date

                except Exception as e:
                    if soup == None:
                        print('Soup is None!')
                        print(f'URL= {url}')
                        print('--------------------\n')
                        return np.nan

                    elif not soup.find('div', {'class': 'news_nav news_pdate_c col-xs-36 col-sm-18'}):
                            print('soup.find is empty!')
                            print(f'URL= {url}')
                            print(soup.text)
                            print('--------------------\n')
                            return np.nan

                    else:
                        print(f'Error: {e}')

        #------------------ shana ------------------
        if source == 'shana':
            try:
                # Get date out of the source
                date = soup.find('div', {'class': 'col-6 col-sm-4'}).text.split('-')[0].strip(' \n')
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('div', {'class': 'col-6 col-sm-4'}):
                    #if not soup.find('span', {'class': 'date'}):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan
                    #else:
                        #date1 = soup.find('span', {'class': 'date'}).text.split('ساعت')[0].strip(' -')
                        #return date1

                else:
                    print(f'Error: {e}')

        #------------------ jahannews ------------------
        if source == 'jahannews':
            try:
                # Get date out of the source
                date = ' '.join(soup.find('div', {'id': 'docDiv3Date'}).text.split(' ')[1:4])
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('div', {'id': 'docDiv3Date'}):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan

                else:
                    print(f'Error: {e}')

        #------------------ mashreghnews ------------------
        if source == 'mashreghnews':
            try:
                # Get date out of the source
                date = soup.find('div', {'class':'col-xs-4 head-date'}).text.split('-')[0].strip(' \n')
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('div', {'class':'col-xs-4 head-date'}):
                    #if not soup.find('span', {'class': 'date'}):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan
                    #else:
                        #date1 = soup.find('span', {'class': 'date'}).text.split('ساعت')[0].strip(' -')
                        #return date1

                else:
                    print(f'Error: {e}')

        #------------------ iscanews ------------------
        if source == 'iscanews':
            try:
                # Get date out of the source
                date = soup.find('div', {'class':'col-6 col-sm-4 item-date'})\
                .text.split('-')[0].strip(' \n')
                return date

            except Exception as e:
                if soup == None:
                    print('Soup is None!')
                    print(f'URL= {url}')
                    print('--------------------\n')
                    return np.nan

                elif not soup.find('div', {'class':'col-6 col-sm-4 item-date'}):
                    #if not soup.find('span', {'class': 'date'}):
                        print('soup.find is empty!')
                        print(f'URL= {url}')
                        print(soup.text)
                        print('--------------------\n')
                        return np.nan
                    #else:
                        #date1 = soup.find('span', {'class': 'date'}).text.split('ساعت')[0].strip(' -')
                        #return date1

                else:
                    print(f'Error: {e}')
        else:
            print('None of above')
    
    # This method adds extracted dates to corrosponding row in dataframes
    def add_date_to_df(self, data, source):
        print(f'source is : {source}')
        data['date'] =  data[['html', 'url']].apply(lambda x: self.get_date(x, source),axis=1)
        return data
    
    # This method does the proccess of add date for all dataframes
    def add_all_dates(self) -> None:
        print(f'Start adding dates to dataframes...')
        
        for source in tqdm(self.data.keys()):
            print(f'{source} in progress...\n')
        
            self.data[source] = self.parallelize(self.data[source], source)

            
            print(f'{source} dates added successfully :)')
            print(f'---------------------------------------------------------------------\n\n')
    
    # This method parallelizes the process of adding title and date to dataframe
    def parallelize(self, data, source=None, n_cores=8):
        # Split series to n segment
        splitted_data = np.array_split(data, n_cores)
        
        pool = mp.Pool((n_cores))
        
        if source == None:
            data = pd.concat(pool.map(self.add_title_to_df, splitted_data))
        else:
            data = pd.concat(pool.starmap(self.add_date_to_df, list(zip(splitted_data,3* [source]))))
            pool.close()
            pool.join()
        
        return data
    
    

# Preproccess

In [9]:
class Preproccessor:
    def __init__(self, data):
        print(f'Preproccesor called...')
        self.data = data
    
    # This method drops NaNs from dataframes
    def drop_nan_values(self) -> None:
        print(f'Start dropping NaN values...')
        
        try:
            for source in tqdm(self.data.keys()):
                print(f'{source} in progress...\n')
                print(f'Number of rows before dropping NaNs: {self.data[source].shape[0]}')

                self.data[source] = self.data[source].dropna()
                
                print(f'Number of rows after dropping NaNs: {self.data[source].shape[0]}')
                print(f'{source} NaNs dropped successfully :)')
                print(f'---------------------------------------------------------------------\n\n')
                
            print(f'Process of dropping NaNs completed successfully :)')
            
        except Exception as e:
            print(f'Unfortunately Process of dropping NaNs failed :(')
            print(f'Error Raised:{e}')
    
    # This method rmoves duplicate rows from dataframes
    def remove_duplicate_rows(self) -> None:
        print(f'Start removing duplicate rows...')
        
        try:
            for source in tqdm(self.data.keys()):
                print(f'{source} in progress...\n')
                print(f'Number of rows before removing duplicates: {self.data[source].shape[0]}')

                self.data[source] = self.data[source].drop_duplicates()
                
                print(f'Number of rows after removing duplicates: {self.data[source].shape[0]}')
                print(f'{source} Duplicate rows removed successfully :)')
                print(f'---------------------------------------------------------------------\n\n')
                
            print(f'Process of removing duplicates completed successfully :)')
            
        except Exception as e:
            print(f'Unfortunately Process of  removing duplicates failed :(')
            print(f'Error Raised:{e}')
            
    # This method converts dataes to same format
    def convert_date(self):
        pass
    
    # This method merges dataframes with each other
    def merge_data_frames(self) -> None:
        print(f'Start appending dataframes...')
        
        data_to_merge = []
        
        try:
            for source in tqdm(self.data.keys()):
                print(f'{source} in progress...\n')

                data_to_merge.append(self.data[source])
                
                print(f'{source} appended successfully :)')
                print(f'---------------------------------------------------------------------\n\n')
                
            print(f'Process of appending completed successfully :)')
            
            print(f'Start merging dataframes...')
            
            self.data = pd.concat(data_to_merge)
            print(f'Process of merging completed successfully :)')
            
        except Exception as e:
            print(f'Unfortunately Process of  removing duplicates failed :(')
            print(f'Error Raised:{e}')
            
    # This method normalizes the given text
    def text_normalizer(self, text):
        normalizer = Normalizer()
        normalized_text = normalizer.normalize(text)
        return normalized_text
    
    # This method normalizes the given column
    def normalizer(self, data):
        data = data.apply(lambda x: self.text_normalizer(x))
        return data
    
    # main normalizer
    def normalize(self):
        print(f'Start normalizing...')
        self.data.title = self.parallelize(self.data.title, self.normalizer)
        print(f'Normalizing finished')
        
    # This method Tokenizes the given text                     
    def word_Tokenizer(self, text):
        tokenizer = WordTokenizer()
        tokenized_text = tokenizer.tokenize(text)
        return tokenized_text
    
    # This method tokenizes the given column
    def tokenizer(self, data):
        data = data.apply(lambda x: self.word_Tokenizer(x))
        return data
    
    # main tokenizer
    def tokenize(self):
        print(f'Start tokenizing...')
        self.data.title = self.parallelize(self.data.title, self.tokenizer)
        print(f'tokenizing finished')
        
    # This method lemmatize the given text                     
    def token_lemmatizer(self, text):
        lemmatized_text = []
        for token in text:
            lemmatizer = Lemmatizer()
            lemmatized_text.append(lemmatizer.lemmatize(token))
            
        return lemmatized_text
    
    # This method lemmatize the given column
    def lemmatizer(self, data):
        data = data.apply(lambda x: self.token_lemmatizer(x))
        return data
    
    # main lemmatizer
    def lemmatize(self):
        print(f'Start lemmatizing...')
        self.data.title = self.parallelize(self.data.title, self.lemmatizer)
        print(f'lemmatizing finished')
    
    def remove_stopwords(self):
        for row in self.data.title:
            for token in row:
                if token in stopwords_list():
                    row.remove(token)
        print('stopwords removed!')
        
    def parallelize(self, data, func, n_cores=8):
        # Split series to n segment
        splitted_data = np.array_split(data, n_cores)
                          
        pool = mp.Pool((n_cores))
        data = pd.concat(pool.map(func, splitted_data))
        pool.close()
        pool.join()
        
        return data

# Postproccessing

In [10]:
class Postproccessor:
    def __init__(self, data):
        print('Postproccessor is called...')
        self.data = data
    
    def remove_single_chars(self):
        for row in self.data.title:
            for token in row:
                if len(token)==1:
                    row.remove(token)
        print('single words removed!')


    def remove_punctuations(self):
        regex_pattern = re.compile(pattern=r'[^\w\s()]')
        for row in self.data.title:
                for token in row:
                    token = re.sub(pattern=regex_pattern, repl=r'', string=token)

        print('punctuations removed!')

# Trend Extraction

In [15]:
class TrenExtractor:
    def __init__(self, data):
        print('trend extractor is called...')
        self.data =data
        self.overall_trend = ''
        self.daily_trend = ''
        
    def find_overall_trend(self):
        all_words = []
        for row in self.data.title:
            for token in row:
                all_words.append(token)
                
        trend = Counter(all_words).most_common(1)
        print(f'The overall trend is {trend[0][0]} with {trend[0][1]} occurence\n')
        print(f'The top 10 trends are {Counter(all_words).most_common(10)}\n')

    def find_daily_trend(self):
        data.groupby(data['date'])
        """Unfortunately due to a lack of time  to parse dates to correct and same format
        I couldn't complete this part but if dates format were correct, the idea is to
        group by date and in each group count the most frequent word and then return the
        frequent word in each day"""
        

In [12]:
if __name__ == '__main__':
    path = '/home/afsharino/Desktop/Trend-extraction-from-news/datasets/*csv'
    # Read Data
    data_reader = DataReader()
    data_reader.read_dataset(path)
    data_reader.add_all_titles()
    data_reader.add_all_dates()
    
    # Preproccess
    d = data_reader.data.copy()
    
    preproccessor = Preproccessor(d)
    preproccessor.drop_nan_values()
    preproccessor.remove_duplicate_rows()
    preproccessor.merge_data_frames()
    preproccessor.normalize()
    preproccessor.tokenize()
    preproccessor.remove_stopwords()
    
    # Postproccess
    d = preproccessor.data
    postproccessor = Postproccessor(d)
    postproccessor.remove_punctuations()
    postproccessor.remove_single_chars()
    
    # Trend Extraction
    d = postproccessor.data
    trend_extractor = TrenExtractor(d)
    trend_extractor.find_overall_trend()

DataReader called...
Reading datasets...


  0%|          | 0/14 [00:00<?, ?it/s]

data with name asriran saved successfully :)
data with name mehrnews saved successfully :)
data with name khabaronline saved successfully :)
data with name yjcnews saved successfully :)
data with name tasnimnews saved successfully :)
data with name farsnews saved successfully :)
data with name varzesh3 saved successfully :)
data with name isna saved successfully :)
data with name entekhab saved successfully :)
data with name anapress saved successfully :)
data with name shana saved successfully :)
data with name jahannews saved successfully :)
data with name mashreghnews saved successfully :)
data with name iscanews saved successfully :)
Process of reading datasets completed successfully :)
Start adding titles to dataframes...


  0%|          | 0/14 [00:00<?, ?it/s]

asriran in progress...

Soup.title is None!
URL= https://www.asriran.com/fa/news/823655
403 Forbidden
Request forbidden by administrative rules.


--------------------

Soup.title is None!
URL= https://www.asriran.com/fa/news/823655
403 Forbidden
Request forbidden by administrative rules.


--------------------

Soup.title is None!
URL= https://www.asriran.com/fa/news/725405/%D8%A7%D9%86%D8%AA%D8%AE%D8%A7%D8%A8%D8%A7%D8%AA-%D8%AE%D8%A7%D9%86%D9%87-%D8%B3%DB%8C%D9%86%D9%85%D8%A7-%DA%86%D9%87-%D8%B2%D9%85%D8%A7%D9%86%DB%8C-%D8%A8%D8%B1%DA%AF%D8%B2%D8%A7%D8%B1-%D9%85%DB%8C%E2%80%8C%D8%B4%D9%88%D8%AF
403 Forbidden
Request forbidden by administrative rules.


--------------------

Soup.title is None!
URL= https://www.asriran.com/fa/news/725405/%D8%A7%D9%86%D8%AA%D8%AE%D8%A7%D8%A8%D8%A7%D8%AA-%D8%AE%D8%A7%D9%86%D9%87-%D8%B3%DB%8C%D9%86%D9%85%D8%A7-%DA%86%D9%87-%D8%B2%D9%85%D8%A7%D9%86%DB%8C-%D8%A8%D8%B1%DA%AF%D8%B2%D8%A7%D8%B1-%D9%85%DB%8C%E2%80%8C%D8%B4%D9%88%D8%AF
403 Forbidden
Request for


Soup.title is None!
URL= https://ana.press/fa/news/501207/%DB%B6-%D9%86%DA%A9%D8%AA%D9%87-%D8%AF%D8%B1-%D9%85%D9%88%D8%B1%D8%AF-%D9%88%D8%B1%D8%B2%D8%B4-%D8%AF%D8%B1-%D9%87%D9%88%D8%A7%DB%8C-%DA%AF%D8%B1%D9%85
403 Forbidden
Request forbidden by administrative rules.


--------------------

Soup.title is None!
URL= https://ana.press/fa/news/501207/%DB%B6-%D9%86%DA%A9%D8%AA%D9%87-%D8%AF%D8%B1-%D9%85%D9%88%D8%B1%D8%AF-%D9%88%D8%B1%D8%B2%D8%B4-%D8%AF%D8%B1-%D9%87%D9%88%D8%A7%DB%8C-%DA%AF%D8%B1%D9%85
403 Forbidden
Request forbidden by administrative rules.


--------------------

Soup.title is None!
URL= https://ana.press/fa/news/659538/%D8%AF%D8%B3%D8%AA%D9%88%D8%B1-%D8%B1%D8%A6%DB%8C%D8%B3%DB%8C-%D8%A8%D8%B1%D8%A7%DB%8C-%DA%A9%D9%86%D8%A7%D8%B1-%DA%AF%D8%B0%D8%A7%D8%B4%D8%AA%D9%86-%D9%85%D8%A7%D9%86%D8%B9%E2%80%8C%D8%AA%D8%B1%D8%A7%D8%B4%D8%A7%D9%86-%D8%AA%D8%AD%D9%82%D9%82-%D8%AC%D9%87%D8%B4-%D8%B3%D8%A7%D8%AE%D8%AA-%D9%85%D8%B3%DA%A9%D9%86
403 Forbidden
Request forbidden by administrat

  0%|          | 0/14 [00:00<?, ?it/s]

asriran in progress...

source is : asriran
soup.find_all is empty!
URL= https://www.asriran.com/fa/news/823655
403 Forbidden
Request forbidden by administrative rules.


--------------------

soup.find_all is empty!
URL= https://www.asriran.com/fa/news/823655
403 Forbidden
Request forbidden by administrative rules.


--------------------

source is : asriran
soup.find_all is empty!
URL= https://www.asriran.com/fa/news/725405/%D8%A7%D9%86%D8%AA%D8%AE%D8%A7%D8%A8%D8%A7%D8%AA-%D8%AE%D8%A7%D9%86%D9%87-%D8%B3%DB%8C%D9%86%D9%85%D8%A7-%DA%86%D9%87-%D8%B2%D9%85%D8%A7%D9%86%DB%8C-%D8%A8%D8%B1%DA%AF%D8%B2%D8%A7%D8%B1-%D9%85%DB%8C%E2%80%8C%D8%B4%D9%88%D8%AF
403 Forbidden
Request forbidden by administrative rules.


--------------------

soup.find_all is empty!
URL= https://www.asriran.com/fa/news/725405/%D8%A7%D9%86%D8%AA%D8%AE%D8%A7%D8%A8%D8%A7%D8%AA-%D8%AE%D8%A7%D9%86%D9%87-%D8%B3%DB%8C%D9%86%D9%85%D8%A7-%DA%86%D9%87-%D8%B2%D9%85%D8%A7%D9%86%DB%8C-%D8%A8%D8%B1%DA%AF%D8%B2%D8%A7%D8%B1-%D9%85%DB

  0%|          | 0/14 [00:00<?, ?it/s]

asriran in progress...

Number of rows before dropping NaNs: 9
Number of rows after dropping NaNs: 4
asriran NaNs dropped successfully :)
---------------------------------------------------------------------


mehrnews in progress...

Number of rows before dropping NaNs: 9
Number of rows after dropping NaNs: 9
mehrnews NaNs dropped successfully :)
---------------------------------------------------------------------


khabaronline in progress...

Number of rows before dropping NaNs: 9
Number of rows after dropping NaNs: 9
khabaronline NaNs dropped successfully :)
---------------------------------------------------------------------


yjcnews in progress...

Number of rows before dropping NaNs: 9
Number of rows after dropping NaNs: 9
yjcnews NaNs dropped successfully :)
---------------------------------------------------------------------


tasnimnews in progress...

Number of rows before dropping NaNs: 9
Number of rows after dropping NaNs: 9
tasnimnews NaNs dropped successfully :)
----

  0%|          | 0/14 [00:00<?, ?it/s]

asriran in progress...

Number of rows before removing duplicates: 4
Number of rows after removing duplicates: 3
asriran Duplicate rows removed successfully :)
---------------------------------------------------------------------


mehrnews in progress...

Number of rows before removing duplicates: 9
Number of rows after removing duplicates: 6
mehrnews Duplicate rows removed successfully :)
---------------------------------------------------------------------


khabaronline in progress...

Number of rows before removing duplicates: 9
Number of rows after removing duplicates: 7
khabaronline Duplicate rows removed successfully :)
---------------------------------------------------------------------


yjcnews in progress...

Number of rows before removing duplicates: 9
Number of rows after removing duplicates: 7
yjcnews Duplicate rows removed successfully :)
---------------------------------------------------------------------


tasnimnews in progress...

Number of rows before removing du

  0%|          | 0/14 [00:00<?, ?it/s]

asriran in progress...

asriran appended successfully :)
---------------------------------------------------------------------


mehrnews in progress...

mehrnews appended successfully :)
---------------------------------------------------------------------


khabaronline in progress...

khabaronline appended successfully :)
---------------------------------------------------------------------


yjcnews in progress...

yjcnews appended successfully :)
---------------------------------------------------------------------


tasnimnews in progress...

tasnimnews appended successfully :)
---------------------------------------------------------------------


farsnews in progress...

farsnews appended successfully :)
---------------------------------------------------------------------


varzesh3 in progress...

varzesh3 appended successfully :)
---------------------------------------------------------------------


isna in progress...

isna appended successfully :)
------------------------

Note: Due to a lack of time I just select 20 rows from each dataset to just show you the output but the code works properly until 772 rows and after that there are some lines you should skip because of problem in csv files.