In [1]:
import pandas as pd
import random
import string
from datetime import datetime, timedelta

def random_string(length=100):
    """Return `length` characters drawn at random from a-z.

    Every character is an independent pick from ``string.ascii_lowercase``
    made with the module-level ``random`` generator.
    """
    alphabet = string.ascii_lowercase
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)

def generate_serp_data(keywords, days, max_rank=10, start_date=None):
    """
    Generate fake SERP data for a set of keywords over a given number of days.

    One row is produced for every (day, keyword, rank) combination, so the
    result has ``days * len(keywords) * max_rank`` rows.

    :param keywords: List of keywords.
    :param days: Number of consecutive days to simulate, starting at ``start_date``.
    :param max_rank: Maximum rank for a keyword; ranks 1..max_rank are emitted.
    :param start_date: First simulated day as a ``datetime``. Defaults to
        January 1, 2023 when omitted.
    :return: DataFrame with the columns ``date`` (``YYYY-MM-DD`` string),
        ``keyword``, ``rank``, ``meta_description`` (160 random characters) and
        ``featured_snippet`` (50 random characters, or ``None`` when no snippet
        was drawn). The columns are present even when no rows are generated.
    """
    # Fixed schema: without it an empty input would yield a frame with no
    # columns at all, and any later column lookup would raise KeyError.
    columns = ['date', 'keyword', 'rank', 'meta_description', 'featured_snippet']

    if start_date is None:
        start_date = datetime(2023, 1, 1)  # starting from January 1, 2023

    data = []
    for day in range(days):
        date_str = (start_date + timedelta(days=day)).strftime('%Y-%m-%d')
        for keyword in keywords:
            for rank in range(1, max_rank + 1):
                data.append({
                    'date': date_str,
                    'keyword': keyword,
                    'rank': rank,
                    'meta_description': random_string(160),
                    # A snippet is attached only when the draw exceeds 0.8.
                    'featured_snippet': random_string(50) if random.random() > 0.8 else None,
                })

    return pd.DataFrame(data, columns=columns)

# Example usage
# Seed the global RNG first so that a fresh-kernel "Run All" regenerates the
# same synthetic dataset instead of a different one on every execution.
random.seed(42)

keywords = ['seo tips', 'best seo practices', 'how to improve seo', 'seo trends']
serp_data = generate_serp_data(keywords, 30)  # 30 days x 4 keywords x 10 ranks

serp_data.head()


Unnamed: 0,date,keyword,rank,meta_description,featured_snippet
0,2023-01-01,seo tips,1,poyzmngdclneelvoegnolggcjanpwfrqrvbjbekyvhtnbe...,
1,2023-01-01,seo tips,2,rxoriiksgoofymruwkhoqgyirrbquagtlhqaoylenntvqk...,
2,2023-01-01,seo tips,3,ryouzhihrdmayhbgazjzgqnlophdftjncoxijdwksnskrx...,
3,2023-01-01,seo tips,4,sqzsaalsohsxkmmrbitepeceaojxnyfctkfgjyxbhdkqzc...,
4,2023-01-01,seo tips,5,qnobgnwefhftszuavjlxvvddovbneaieflltujgatloiqf...,


In [2]:
# Summarise the generated frame: row count plus dtype and non-null count per column.
serp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              1200 non-null   object
 1   keyword           1200 non-null   object
 2   rank              1200 non-null   int64 
 3   meta_description  1200 non-null   object
 4   featured_snippet  248 non-null    object
dtypes: int64(1), object(4)
memory usage: 47.0+ KB


In [3]:
# Persist the generated SERP data as CSV, leaving out the DataFrame index column
serp_data.to_csv(path_or_buf='serp_data.csv', index=False)

In [11]:
#import label encoder
from sklearn.preprocessing import LabelEncoder

# Generating a test dataset using only keywords from the training set
def generate_random_text(length=50):
    """Return `length` random characters taken from ASCII letters and digits."""
    charset = string.ascii_letters + string.digits
    chars = [random.choice(charset) for _ in range(length)]
    return ''.join(chars)


# Keywords present in the original dataset; the test set reuses exactly these
unique_keywords = serp_data['keyword'].unique()

# Build one record per (day, keyword, rank) for April 1-7, 2023
test_data = []
days_test = range(1, 8)  # Simulating for one week

for day_of_month in days_test:
    date_label = f'2023-04-{day_of_month:02d}'
    for kw in unique_keywords:
        for position in range(1, 11):  # Assuming ranks from 1 to 10
            description = generate_random_text(160)
            # Roughly half of the rows get a snippet; the rest carry an empty string
            if random.random() > 0.5:
                snippet = generate_random_text(50)
            else:
                snippet = ''
            test_data.append({
                'date': date_label,
                'keyword': kw,
                # Actual rank; in a real prediction setting this would be unknown
                'rank': position,
                'meta_description': description,
                'featured_snippet': snippet,
            })

# Assemble the records into a DataFrame
test_df = pd.DataFrame(test_data)

# Write the test set to CSV without the index column
test_df.to_csv('test_data.csv', index=False)
