# View SQL Code of enwiki-latest-category.sql

## TL;DR

### This is step X toward the goal of:

- Get Wikipedia categories into database:
- (Create table simply linking page_id and top-level wikipedia category)

### Questions driving exploration:

- Once opened, is this SQL command file ready to export to Postgres?
- What is the schema of enwiki-latest-category.sql?
- What do its records generally look like?

### Answers:

- **Ready to port?**
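    - Yes, as far as I can tell — the file reads cleanly. Caveat: the statements are in MySQL/MariaDB dialect (backtick-quoted identifiers, `int(10) unsigned`, `varbinary`, `ENGINE=InnoDB`), so they will need translating before Postgres accepts them.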
- **What is the schema?**
    - 1 table
        - category
    - 5 columns
        - cat_id cat_title cat_pages cat_subcats cat_files
    - 2,209,005 rows
- **What do the records look like?**
    - One example
        - (2,'Unprintworthy_redirects',1494044,20,0)

## SQL Code - Explored via Python

### Get SQL into Python

#### Imports, current working directory

In [3]:
import os
from pathlib import Path
import pandas as pd
import psycopg2
import sqlalchemy

In [4]:
# Switch to the project's notebooks directory so that the relative
# '../data/...' paths in the cells below resolve against it.
home_path = str(Path.home())
notebooks_dir = Path(home_path) / 'git' / 'predwikt' / 'notebooks'
os.chdir(notebooks_dir)

#### Check row count (number of lines in the SQL file)

In [5]:
# Count the lines in the SQL dump file.
# The `with` block closes the file even if reading raises part-way through.
# errors='backslashreplace' renders undecodable bytes as backslash escapes
# instead of raising UnicodeDecodeError.
with open(
    '../data/raw/enwiki-latest-category.sql',
    mode='r',
    encoding='UTF_8',
    errors='backslashreplace',
) as f:
    # Stream the file one line at a time; nothing is kept in memory.
    ct = sum(1 for _ in f)
print(ct)

164


#### Populate rows into a list of strings

In [6]:
# Read every line of the SQL dump into the list `l` (one string per line,
# trailing newline kept). The name `l` is retained because the cells below
# index into it.
with open(
    '../data/raw/enwiki-latest-category.sql',
    mode='r',
    encoding='UTF_8',
    errors='backslashreplace',
) as f:
    l = list(f)
# Report the size of the list just built, rather than a counter left over
# from an earlier cell, so the number always reflects this read.
print(len(l))

164


### Peek at SQL code using Python

#### FIRST 10 ROWS

In [7]:
# Print the first ten lines of the dump. Each line still carries its own
# newline, so print() leaves a blank line after every entry.
# Slicing avoids an IndexError if the file has fewer lines than expected.
for row in l[:10]:
    print(row)

-- MySQL dump 10.19  Distrib 10.3.31-MariaDB, for debian-linux-gnu (x86_64)

--

-- Host: 10.64.48.13    Database: enwiki

-- ------------------------------------------------------

-- Server version	10.4.21-MariaDB-log



/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;

/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;

/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;

/*!40101 SET NAMES utf8mb4 */;



#### ROWS 10+

In [8]:
# Print lines 10-19 of the dump (zero-based). Slicing avoids an IndexError
# if the file has fewer lines than expected.
for row in l[10:20]:
    print(row)

/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;

/*!40103 SET TIME_ZONE='+00:00' */;

/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;

/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;

/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;

/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;



--

-- Table structure for table `category`

--



#### ROWS 20+

In [9]:
# Print lines 20-29 of the dump (zero-based). Slicing avoids an IndexError
# if the file has fewer lines than expected.
for row in l[20:30]:
    print(row)



DROP TABLE IF EXISTS `category`;

/*!40101 SET @saved_cs_client     = @@character_set_client */;

/*!40101 SET character_set_client = utf8 */;

CREATE TABLE `category` (

  `cat_id` int(10) unsigned NOT NULL AUTO_INCREMENT,

  `cat_title` varbinary(255) NOT NULL DEFAULT '',

  `cat_pages` int(11) NOT NULL DEFAULT 0,

  `cat_subcats` int(11) NOT NULL DEFAULT 0,

  `cat_files` int(11) NOT NULL DEFAULT 0,



#### ROWS 30+

In [19]:
# Print lines 30-39 of the dump (zero-based). Slicing avoids an IndexError
# if the file has fewer lines than expected.
for row in l[30:40]:
    print(row)

  PRIMARY KEY (`cat_id`),

  UNIQUE KEY `cat_title` (`cat_title`),

  KEY `cat_pages` (`cat_pages`)

) ENGINE=InnoDB AUTO_INCREMENT=248633369 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;

/*!40101 SET character_set_client = @saved_cs_client */;



--

-- Dumping data for table `category`

--





#### ROW 40

In [10]:
# Print line 40 of the dump (zero-based) on its own.
print(l[40])

/*!40000 ALTER TABLE `category` DISABLE KEYS */;



#### ROW 41 (EXCERPT)

In [11]:
# Line 41 is a very long INSERT statement; show only its first 1000
# characters.
insert_line = l[41]
print(insert_line[:1000])

INSERT INTO `category` VALUES (2,'Unprintworthy_redirects',1494044,20,0),(3,'Computer_storage_devices',90,11,0),(7,'Unknown-importance_Animation_articles',280,22,0),(8,'Low-importance_Animation_articles',13927,22,0),(9,'Vietnam_stubs',304,10,0),(10,'Rivers_of_Vietnam',104,3,0),(11,'Quang_Binh_Province',0,0,0),(12,'All_articles_with_unsourced_statements',442856,0,0),(14,'Wikipedia_articles_needing_clarification',183,183,0),(15,'Articles_needing_additional_references_from_January_2008',1325,0,0),(16,'Comedy',104,31,0),(17,'Sociolinguistics',252,29,0),(18,'Figures_of_speech',134,13,0),(20,'NASCAR_teams',109,3,0),(21,'Muhammad_Ali',18,4,0),(22,'Politics_and_government_work_group_articles',186621,4,0),(23,'Wikipedia_requested_photographs_of_politicians_and_government-people',10927,1,0),(24,'Stub-Class_biography_(politics_and_government)_articles',97011,0,0),(26,'Stub-Class_biography_articles',1023944,10,0),(27,'Unassessed_biography_articles',72633,10,0),(29,'High-importance_Animation_articl

#### ROWS 152+

In [18]:
# Print everything from line 152 (zero-based) to the end of the dump.
# An open-ended slice is used so the file's total line count is not
# hard-coded here.
for row in l[152:]:
    print(row)

/*!40000 ALTER TABLE `category` ENABLE KEYS */;

/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;



/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;

/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;

/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;

/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;

/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;

/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;

/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;



-- Dump completed on 2021-11-20 10:09:38

