In [1]:
# -*- coding: utf-8 -*-
"""
Automatically generated by Colab.
"""
!pip install pandas numpy matplotlib python-docx xlsxwriter xlrd openpyxl scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from docx import Document
from docx.shared import Inches
from google.colab import files
import zipfile
import os
from scipy.optimize import minimize

# Ask the user for the input workbooks via google.colab's files.upload();
# the main loop below iterates over the keys (file names) of the result.
uploaded = files.upload()

# Output folders for the PNG plots and for the generated Word reports.
# exist_ok=True makes creation idempotent without a separate (race-prone)
# os.path.exists() check.
os.makedirs('plots', exist_ok=True)
os.makedirs('excels', exist_ok=True)

# One dict per processed sheet; written out as summary.xlsx at the end.
summary_data = []

def calculate_h_index(y_data):
    """Return the h-index of a collection of citation counts.

    The h-index is the largest ``h`` such that at least ``h`` entries of
    ``y_data`` are each greater than or equal to ``h``.

    Args:
        y_data: One-dimensional array-like of citation counts, in any order.

    Returns:
        The h-index as an ``int``. ``0`` is returned when ``y_data`` is empty
        or when no entry reaches 1; the previous implementation raised
        ``ValueError`` in that case because it took ``np.max`` of an empty
        index array.
    """
    sorted_cites = np.sort(y_data)[::-1]
    ranks = np.arange(1, len(sorted_cites) + 1)
    # With the counts in descending order, ``cites >= rank`` is true for
    # exactly the first h positions, so counting the hits gives h directly
    # and stays well-defined when there are no hits at all.
    return int(np.count_nonzero(sorted_cites >= ranks))

def calculate_abc(M, N, h):
    """Compute the coefficients ``(a, b, c)`` from ``M``, ``N`` and ``h``.

    The caller in this file passes the maximum of the 'Cites' column as
    ``M``, the number of rows as ``N`` and the h-index as ``h``, and then
    evaluates the curve ``b / (x + c) - a`` with the result.

    Args:
        M: Numeric scalar.
        N: Numeric scalar.
        h: Numeric scalar.

    Returns:
        Tuple ``(a, b, c)``.

    Note:
        All three coefficients share the denominator ``M*N - (M + N)*h``;
        no guard is applied when it is zero.
    """
    denominator = M * N - (M + N) * h
    h_squared = h**2

    a = (M * h_squared) / denominator
    c = (N * h_squared) / denominator
    b = (M * N * (M - h) * (N - h)) * (h / denominator)**2
    return a, b, c

def calculate_rmse(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Observed values (NumPy array or scalar supporting ``-``).
        y_pred: Predicted values, broadcast-compatible with ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2))`` as computed by NumPy.
    """
    residuals = y_true - y_pred
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

def alternative_model(params, x_data, phi, N):
    """Evaluate the one-parameter alternative model at ``x_data``.

    With ``p = params[0]`` the value at each ``x`` is::

        ((1 - p) / p) * (phi / N)
            * (((x / (x - p)) * ((x + 1) / (x + 1 - p))) ** (N / (N - p)) - 1)

    Args:
        params: Sequence whose first element is the shape parameter ``p``
            (sequence form so it can be used directly as an optimizer vector).
        x_data: NumPy array of abscissae.
        phi: Numeric scalar; the caller passes the sum of the 'Cites' column.
        N: Numeric scalar; the caller passes the number of data points.

    Returns:
        Array of model values with the shape of ``x_data``.

    Note:
        The expression is singular for ``p == 0``, ``p == N`` and wherever
        ``p`` equals ``x`` or ``x + 1``; no guard is applied here.
    """
    p = params[0]

    scale = ((1 - p) / p) * (phi / N)
    ratio = (x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))
    exponent = N / (N - p)

    return scale * (ratio ** exponent - 1)

def grid_search_optimization(x_data, y_data, phi, N, num_points=100):
    """Coarse grid search for the ``p`` that minimises the model's RMSE.

    ``p`` is scanned over ``num_points`` evenly spaced values in
    ``[-1000, 1000]`` and ``alternative_model`` is scored against ``y_data``
    with ``calculate_rmse`` at each one.

    Args:
        x_data: NumPy array of abscissae passed to ``alternative_model``.
        y_data: NumPy array of observed values.
        phi: Scalar forwarded to ``alternative_model``.
        N: Scalar forwarded to ``alternative_model``.
        num_points: Number of grid points.

    Returns:
        Tuple ``(best_p, best_rmse)``. If no grid point produces a finite
        RMSE the result is ``(None, float('inf'))``.
    """
    p_values = np.linspace(-1000, 1000, num_points)
    best_p = None
    best_rmse = float('inf')

    # The grid does NOT reliably avoid the model's singular points: an odd
    # num_points puts p exactly on 0, and positive grid values can coincide
    # with x, x + 1 or N. Those candidates evaluate to inf/NaN, so silence the
    # resulting floating-point warnings here and reject them explicitly
    # instead of relying on NaN comparisons being False.
    with np.errstate(all='ignore'):
        for p in p_values:
            fitted_values = alternative_model([p], x_data, phi, N)
            rmse = calculate_rmse(y_data, fitted_values)

            if np.isfinite(rmse) and rmse < best_rmse:
                best_rmse = rmse
                best_p = p

    return best_p, best_rmse

def local_optimization(x_data, y_data, phi, N, initial_guess):
    """Refine ``p`` with SciPy's L-BFGS-B, starting from ``initial_guess``.

    The objective is the RMSE between ``y_data`` and ``alternative_model``
    evaluated at the candidate parameter vector.

    Args:
        x_data: NumPy array of abscissae passed to ``alternative_model``.
        y_data: NumPy array of observed values.
        phi: Scalar forwarded to ``alternative_model``.
        N: Scalar forwarded to ``alternative_model``.
        initial_guess: One-element sequence holding the starting ``p``.

    Returns:
        Tuple ``(p, rmse)``: the first component of the optimizer's solution
        vector and the objective value it reports there.
    """
    def rmse_at(candidate):
        prediction = alternative_model(candidate, x_data, phi, N)
        return calculate_rmse(y_data, prediction)

    # No bounds are passed, so the search over p is unconstrained.
    outcome = minimize(rmse_at, initial_guess, method='L-BFGS-B')
    refined_p = outcome.x[0]
    return refined_p, outcome.fun

def zip_dir(dir_path, ziph):
    """Add every file under ``dir_path`` (recursively) to an open zip archive.

    Archive names are computed relative to the parent of ``dir_path``, so the
    directory's own name is kept as the leading path component.

    Args:
        dir_path: Directory to walk.
        ziph: Object with a ``write(filename, arcname)`` method, e.g. a
            ``zipfile.ZipFile`` opened for writing.
    """
    archive_base = os.path.join(dir_path, '..')
    for current_dir, _subdirs, file_names in os.walk(dir_path):
        for entry in file_names:
            full_path = os.path.join(current_dir, entry)
            ziph.write(full_path, os.path.relpath(full_path, archive_base))

# Process every uploaded workbook: fit both models to each sheet, record the
# results in summary_data, and write one Word report of plots per workbook.
for file_name in uploaded.keys():
    xls = pd.ExcelFile(file_name)

    # One Word document per workbook; a heading and a plot are appended for
    # each sheet below.
    doc = Document()
    doc.add_heading('Plots for Each Sheet', 0)

    for sheet_name in xls.sheet_names:
        # Every sheet must provide 'Serial Number' and 'Cites' columns.
        df = pd.read_excel(xls, sheet_name=sheet_name)
        x_data = df['Serial Number'].values
        y_data = df['Cites'].values

        M = np.max(y_data)  # largest value in the 'Cites' column
        N = len(y_data)  # number of rows read from the sheet
        phi = np.sum(y_data)  # sum of the 'Cites' column
        h = calculate_h_index(y_data)

        # Closed-form model (labelled 'CE00h' in the plot): the curve
        # b / (x + c) - a with coefficients derived from M, N and h.
        a, b, c = calculate_abc(M, N, h)
        fitted_values = (b / (x_data + c)) - a
        rmse_original = calculate_rmse(y_data, fitted_values)

        # Coarse grid search over p to obtain a starting point.
        p_initial, rmse_grid = grid_search_optimization(x_data, y_data, phi, N, num_points=200)

        # Local refinement of p starting from the best grid value.
        p_opt, rmse_alt = local_optimization(x_data, y_data, phi, N, [p_initial])
        fitted_values_alt = alternative_model([p_opt], x_data, phi, N)

        # One summary row per (file, sheet); rmse_grid is not recorded.
        summary_data.append({
            'File': file_name,
            'Sheet': sheet_name,
            'RMSE_Alternative_Model': rmse_alt,
            'RMSE_CE00h': rmse_original,
            'p_Alternative_Model': p_opt
        })

        print(f"File: {file_name}, Sheet: {sheet_name}, p_opt: {p_opt}, RMSE_Alternative_Model: {rmse_alt}")

        # Scatter of the raw data with both fitted curves overlaid.
        plt.figure(figsize=(10, 6))
        plt.scatter(x_data, y_data, label='Data Points')
        plt.plot(x_data, fitted_values, label='CE00h')
        plt.plot(x_data, fitted_values_alt, label='Alternative Model')
        plt.title(f'Fitted Models for {sheet_name}')
        plt.xlabel('Serial Number')
        plt.ylabel('Cites')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

        # NOTE(review): file_name keeps its extension, so the image ends up
        # named like 'plots/5authors.xlsx_ECS21.png'.
        plt.savefig(f'plots/{file_name}_{sheet_name}.png')
        # Close the figure so figures do not accumulate across sheets.
        plt.close()

        # Embed the image that was just saved into the Word report.
        doc.add_heading(f'Plot for {sheet_name}', level=1)
        doc.add_picture(f'plots/{file_name}_{sheet_name}.png', width=Inches(6))

    # The Word report is stored in the 'excels' folder (despite the name).
    doc.save(f'excels/{file_name}_plots.docx')

# Collect the per-sheet results into a single spreadsheet (no index column).
summary_df = pd.DataFrame(summary_data)
summary_df.to_excel('summary.xlsx', index=False)

# Bundle the summary plus everything under 'plots' and 'excels' into one
# archive; zip_dir keeps each folder name as the leading path component.
with zipfile.ZipFile('output.zip', 'w') as zipf:
    zipf.write('summary.xlsx')
    zip_dir('plots', zipf)
    zip_dir('excels', zipf)

# Hand the finished archive to google.colab's files.download.
files.download('output.zip')


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter, python-docx
Successfully installed python-docx-1.1.2 xlsxwriter-3.2.0


Saving 5authors.xlsx to 5authors.xlsx


  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)
  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)
  return np.sqrt(np.mean((y_true - y_pred) ** 2))


File: 5authors.xlsx, Sheet: ECS21, p_opt: -47.26733890224128, RMSE_Alternative_Model: 42.96918765938945


  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)


File: 5authors.xlsx, Sheet: Life21, p_opt: -5.723985262928835, RMSE_Alternative_Model: 95.10723220542252


  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)
  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)
  return np.sqrt(np.mean((y_true - y_pred) ** 2))


File: 5authors.xlsx, Sheet: Med21, p_opt: -54.925587088723155, RMSE_Alternative_Model: 37.97550076482855


  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)
  return np.sqrt(np.mean((y_true - y_pred) ** 2))


File: 5authors.xlsx, Sheet: Phy21, p_opt: -28.886773291180432, RMSE_Alternative_Model: 12.099319695955753


  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)
  fitted_values = ((1 - p) / p) * (phi / N) * (((x_data / (x_data - p)) * ((x_data + 1) / (x_data + 1 - p))) ** (N / (N - p)) - 1)
  return np.sqrt(np.mean((y_true - y_pred) ** 2))


File: 5authors.xlsx, Sheet: Soc21, p_opt: -35.56967391049007, RMSE_Alternative_Model: 165.67857137438577


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>